|
@@ -15,7 +15,10 @@
 from multiprocessing import Process, Queue
 from itertools import starmap, chain, islice

+from boto3.exceptions import RetriesExceededError, S3UploadFailedError
 from boto3.s3.transfer import TransferConfig
+from botocore.config import Config
+from botocore.exceptions import ClientError, SSLError

 try:
     # python2
@@ -46,11 +49,13 @@
 import metaflow.tracing as tracing
 from metaflow.metaflow_config import (
     S3_WORKER_COUNT,
+    S3_CLIENT_RETRY_CONFIG,
 )

 DOWNLOAD_FILE_THRESHOLD = 2 * TransferConfig().multipart_threshold
 DOWNLOAD_MAX_CHUNK = 2 * 1024 * 1024 * 1024 - 1

+DEFAULT_S3_CLIENT_PARAMS = {"config": Config(retries=S3_CLIENT_RETRY_CONFIG)}
 RANGE_MATCH = re.compile(r"bytes (?P<start>[0-9]+)-(?P<end>[0-9]+)/(?P<total>[0-9]+)")

 S3Config = namedtuple("S3Config", "role session_vars client_params")
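
The new `DEFAULT_S3_CLIENT_PARAMS` threads a retry policy into every boto3 client that s3op creates. A minimal sketch of how this wires together, assuming `S3_CLIENT_RETRY_CONFIG` is a standard botocore retry dict (its actual value lives in `metaflow.metaflow_config` and is not shown in this diff):

    from botocore.config import Config

    # Hypothetical stand-in for metaflow_config.S3_CLIENT_RETRY_CONFIG; botocore
    # accepts {"max_attempts": <int>, "mode": "legacy" | "standard" | "adaptive"}.
    S3_CLIENT_RETRY_CONFIG = {"max_attempts": 10, "mode": "adaptive"}

    DEFAULT_S3_CLIENT_PARAMS = {"config": Config(retries=S3_CLIENT_RETRY_CONFIG)}

    # The dict is shaped so it can be splatted into a client constructor, e.g.:
    #   boto3.session.Session().client("s3", **DEFAULT_S3_CLIENT_PARAMS)
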
@@ -147,6 +152,7 @@ def normalize_client_error(err):
         "LimitExceededException",
         "RequestThrottled",
         "EC2ThrottledException",
+        "InternalError",
     ):
         return 503
     return error_code
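
The codes in that tuple are the string form a botocore `ClientError` exposes via `err.response["Error"]["Code"]`; `normalize_client_error` folds the retryable ones into HTTP 503 so callers can branch on a single number. The hunk only shows the tail of the function, so this sketch of its overall shape is illustrative rather than the actual body:

    def normalize_client_error(err):
        error_code = err.response["Error"]["Code"]
        try:
            # some responses already carry a numeric HTTP status as the code
            return int(error_code)
        except ValueError:
            if error_code in (
                # ... codes elided by this hunk ...
                "LimitExceededException",
                "RequestThrottled",
                "EC2ThrottledException",
                "InternalError",  # the code this commit adds
            ):
                return 503
        return error_code
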
@@ -254,21 +260,27 @@ def op_info(url):
                except client_error as err:
                    tmp.close()
                    os.unlink(tmp.name)
-                    error_code = normalize_client_error(err)
-                    if error_code == 404:
-                        result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
-                        continue
-                    elif error_code == 403:
-                        result_file.write(
-                            "%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED)
-                        )
-                        continue
-                    elif error_code == 503:
-                        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
-                        continue
-                    else:
-                        raise
-                    # TODO specific error message for out of disk space
+                    handle_client_error(err, idx, result_file)
+                    continue
+                except RetriesExceededError as e:
+                    tmp.close()
+                    os.unlink(tmp.name)
+                    err = convert_to_client_error(e)
+                    handle_client_error(err, idx, result_file)
+                    continue
+                except SSLError as e:
+                    tmp.close()
+                    os.unlink(tmp.name)
+                    result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                    result_file.flush()
+                    continue
+                except Exception as e:
+                    tmp.close()
+                    os.unlink(tmp.name)
+                    # assume anything else is transient
+                    result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                    result_file.flush()
+                    continue
                # If we need the metadata, get it and write it out
                if pre_op_info:
                    with open("%s_meta" % url.local, mode="w") as f:
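
Every failure branch above reports through the same channel as success: a `"%d %d\n"` line pairing the URL's index with 0 or a negated `ERROR_*` constant, flushed so the parent sees it even if the worker dies immediately afterwards. For illustration, a hypothetical parent-side reader of that protocol (the real consumer is the `out_file` loop in `start_workers`, shown only in part below):

    def read_worker_results(out_path):
        # Each line is "<idx> <code>": code 0 means success, negative values
        # are -ERROR_* constants (not found, access denied, transient, ...).
        results = {}
        with open(out_path) as f:
            for line in f:
                idx, code = map(int, line.split())
                results[idx] = code
        return results
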
@@ -322,22 +334,61 @@ def op_info(url):
                    # We indicate that the file was uploaded
                    result_file.write("%d %d\n" % (idx, 0))
                except client_error as err:
-                    error_code = normalize_client_error(err)
-                    if error_code == 403:
-                        result_file.write(
-                            "%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED)
-                        )
-                        continue
-                    elif error_code == 503:
-                        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
-                        continue
-                    else:
-                        raise
+                    # shouldn't get here, but just in case
+                    handle_client_error(err, idx, result_file)
+                    continue
+                except S3UploadFailedError as e:
+                    err = convert_to_client_error(e)
+                    handle_client_error(err, idx, result_file)
+                    continue
+                except SSLError as e:
+                    result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                    result_file.flush()
+                    continue
+                except Exception as e:
+                    # assume anything else is transient
+                    result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+                    result_file.flush()
+                    continue
        except:
            traceback.print_exc()
+            result_file.flush()
            sys.exit(ERROR_WORKER_EXCEPTION)


+def convert_to_client_error(e):
+    match = re.search(
+        r"An error occurred \((\w+)\) when calling the (\w+) operation.*: (.+)", str(e)
+    )
+    assert match, "Failed to parse error message"
+    error_code = match.group(1)
+    operation_name = match.group(2)
+    error_message = match.group(3)
+    response = {
+        "Error": {
+            "Code": error_code,
+            "Message": error_message,
+        }
+    }
+    return ClientError(response, operation_name)
+
+
+def handle_client_error(err, idx, result_file):
+    error_code = normalize_client_error(err)
+    if error_code == 404:
+        result_file.write("%d %d\n" % (idx, -ERROR_URL_NOT_FOUND))
+        result_file.flush()
+    elif error_code == 403:
+        result_file.write("%d %d\n" % (idx, -ERROR_URL_ACCESS_DENIED))
+        result_file.flush()
+    elif error_code == 503:
+        result_file.write("%d %d\n" % (idx, -ERROR_TRANSIENT))
+        result_file.flush()
+    else:
+        raise
+    # TODO specific error message for out of disk space
+
+
 def start_workers(mode, urls, num_workers, inject_failure, s3config):
     # We start the minimum of len(urls) or num_workers to avoid starting
     # workers that will definitely do nothing
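
`convert_to_client_error` exists because boto3's transfer-layer exceptions (`S3UploadFailedError`, `RetriesExceededError`) stringify the underlying `ClientError` instead of exposing it, so the worker re-parses the message to recover a structured error that `handle_client_error` can inspect (the bare `raise` there re-raises whichever exception is active in the calling `except` block). A quick check of the regex against a representative message; the bucket, key, and retry count here are made up for illustration:

    from boto3.exceptions import S3UploadFailedError

    # convert_to_client_error is the helper defined in the hunk above.
    e = S3UploadFailedError(
        "Failed to upload ./part to s3://mybucket/key: An error occurred "
        "(SlowDown) when calling the PutObject operation (reached max "
        "retries: 4): Please reduce your request rate."
    )
    err = convert_to_client_error(e)
    assert err.response["Error"]["Code"] == "SlowDown"
    assert err.operation_name == "PutObject"
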
@@ -381,6 +432,12 @@ def start_workers(mode, urls, num_workers, inject_failure, s3config):
            if proc.exitcode is not None:
                if proc.exitcode != 0:
                    msg = "Worker process failed (exit code %d)" % proc.exitcode
+
+                    # IMPORTANT: if a child process has put items on a queue, then that process will not
+                    # terminate until all buffered items have been flushed to the pipe, causing a deadlock.
+                    # `cancel_join_thread()` allows the subprocess to exit without flushing the queue.
+                    queue.cancel_join_thread()
+
                    exit(msg, proc.exitcode)
        # Read the output file if all went well
        with open(out_path, "r") as out_file:
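
The deadlock described in that comment is documented `multiprocessing.Queue` behavior: any process that has put data on a queue blocks at exit until the queue's feeder thread has flushed every buffered item into the underlying pipe, which never happens if the other side has stopped reading. `cancel_join_thread()` lets the calling process exit without flushing, at the cost of dropping whatever is still buffered; here `start_workers` calls it before `exit()` so the exiting process cannot hang on unconsumed work items. A standalone reproduction of the escape hatch:

    from multiprocessing import Process, Queue

    def producer(q):
        for i in range(100000):  # far more than the pipe buffer holds
            q.put(i)
        # Without this, the process would block at exit, waiting for the
        # queue's feeder thread to flush every buffered item into the pipe.
        q.cancel_join_thread()

    if __name__ == "__main__":
        q = Queue()
        p = Process(target=producer, args=(q,))
        p.start()
        p.join()  # returns; unflushed items are dropped instead of deadlocking
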
@@ -745,7 +802,7 @@ def lst(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     urllist = []
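
This same fallback is applied in each CLI entry point (`lst` here, and `put`, `get`, `info` below): an explicit `--s3clientparams` JSON string still wins, but note the asymmetry that the JSON path decodes to plain dict values while the default carries a live `botocore.config.Config` object. A hypothetical sketch of how `client_params` presumably reaches boto3; the actual client-construction helper is outside this diff:

    import boto3

    def make_s3_client(s3config):
        # client_params is either the decoded --s3clientparams JSON or
        # DEFAULT_S3_CLIENT_PARAMS, i.e. {"config": Config(retries=...)}.
        session = boto3.session.Session()
        return session.client("s3", **(s3config.client_params or {}))
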
@@ -878,7 +935,7 @@ def _make_url(idx, local, user_url, content_type, metadata, encryption):
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     urls = list(starmap(_make_url, _files()))

@@ -1025,7 +1082,7 @@ def get(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     # Construct a list of URL (prefix) objects

@@ -1172,7 +1229,7 @@ def info(
     s3config = S3Config(
         s3role,
         json.loads(s3sessionvars) if s3sessionvars else None,
-        json.loads(s3clientparams) if s3clientparams else None,
+        json.loads(s3clientparams) if s3clientparams else DEFAULT_S3_CLIENT_PARAMS,
     )

     # Construct a list of URL (prefix) objects