python/yugabyte/download_and_extract_archive.py (34 changes: 30 additions & 4 deletions)
@@ -42,6 +42,8 @@
 EXPECTED_ARCHIVE_EXTENSION = '.tar.gz'
 CHECKSUM_EXTENSION = '.sha256'
 ARTIFACT_URL_SUFFIX = '/zip'
+MAX_DOWNLOAD_ATTEMPTS = 5
+RETRY_DELAY_SEC = 5
 
 
 def remove_ignore_errors(file_path: str) -> None:
@@ -98,11 +100,18 @@ def verify_sha256sum(checksum_file_path: str, data_file_path: str) -> bool:
         raise ValueError("Checksum file path must end with '%s', got: %s" % (
             CHECKSUM_EXTENSION, checksum_file_path))
 
-    # Guard against someone passing in the actual data file instead of the checksum file.
+    # Guard against someone passing in the actual data file instead of the checksum file, or
+    # against the server returning an HTML error page in place of the checksum.
     checksum_file_size = os.stat(checksum_file_path).st_size
     if checksum_file_size > 4096:
-        raise IOError("Checksum file size is too big: %d bytes (file path: %s)" % (
-            checksum_file_size, checksum_file_path))
+        try:
+            with open(checksum_file_path, 'rb') as f:
+                preview = f.read(1024).decode('utf-8', errors='replace')
+        except Exception as ex:
+            preview = "<failed to read file: %s>" % ex
+        raise IOError(
+            "Checksum file size is too big: %d bytes (file path: %s). First 1024 bytes:\n%s" % (
+                checksum_file_size, checksum_file_path, preview))
 
     expected_checksum = read_file_and_strip(checksum_file_path).split()[0]
 
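For context on what follows this hunk: `expected_checksum` is then compared against a digest of the downloaded archive. Below is a minimal sketch of that comparison using only the standard library; `compute_sha256` is a hypothetical name for illustration, and the repo's actual helper may be structured differently:

```python
import hashlib


def compute_sha256(data_file_path: str) -> str:
    # Hypothetical helper: stream the archive through SHA-256 in 1 MiB chunks
    # so a multi-gigabyte download is never held in memory at once.
    digest = hashlib.sha256()
    with open(data_file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Sketch of the tail of verify_sha256sum (assumed, not shown in the diff):
#     return compute_sha256(data_file_path) == expected_checksum
```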
@@ -122,7 +131,24 @@ def download_url(url: str, dest_path: str, other_curl_flags: List[str] = []) ->
     dest_dir = os.path.dirname(dest_path)
     if not os.path.isdir(dest_dir):
         raise IOError("Destination directory %s does not exist" % dest_dir)
-    run_cmd(['curl', '-LsS', url, '-o', dest_path] + other_curl_flags)
+    # -f: exit 22 on HTTP errors instead of saving the error page as the artifact.
+    # Retry at the Python level so any curl failure is retried, including transient
+    # non-5xx HTTP errors (e.g. 403s on GitHub's signed release-asset redirects)
+    # that curl's --retry skips. --retry-all-errors would cover this but requires
+    # curl >= 7.71.0, which is unavailable on AlmaLinux 8 / RHEL 8 and similar.
+    cmd = ['curl', '-LsSf', url, '-o', dest_path] + other_curl_flags
+    for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
+        try:
+            run_cmd(cmd)
+            break
+        except subprocess.CalledProcessError as ex:
+            if attempt == MAX_DOWNLOAD_ATTEMPTS:
+                raise
+            logging.warning(
+                "curl failed downloading %s (attempt %d/%d) with exit code %d, "
+                "retrying in %ds",
+                url, attempt, MAX_DOWNLOAD_ATTEMPTS, ex.returncode, RETRY_DELAY_SEC)
+            time.sleep(RETRY_DELAY_SEC)
     if not os.path.exists(dest_path):
         raise IOError("Failed to download %s: file %s does not exist" % (url, dest_path))
     elapsed_sec = time.time() - start_time_sec
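The retry loop is easy to exercise in isolation by stubbing out the command runner. The following self-contained sketch does that; `fake_run_cmd`, the example URL, and the simulated failure count are hypothetical scaffolding, and only the loop itself mirrors the change:

```python
import logging
import subprocess
import time

logging.basicConfig(level=logging.WARNING)

MAX_DOWNLOAD_ATTEMPTS = 5
RETRY_DELAY_SEC = 5

failures_left = 2  # pretend curl hits a transient 403 twice, then succeeds


def fake_run_cmd(cmd: list) -> None:
    # Stand-in for run_cmd: with -f, curl exits with code 22 on an HTTP error.
    global failures_left
    if failures_left > 0:
        failures_left -= 1
        raise subprocess.CalledProcessError(returncode=22, cmd=cmd)


url = 'https://example.com/a.tar.gz'
cmd = ['curl', '-LsSf', url, '-o', '/tmp/a.tar.gz']
for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
    try:
        fake_run_cmd(cmd)
        break
    except subprocess.CalledProcessError as ex:
        if attempt == MAX_DOWNLOAD_ATTEMPTS:
            raise
        logging.warning(
            "curl failed downloading %s (attempt %d/%d) with exit code %d, retrying in %ds",
            url, attempt, MAX_DOWNLOAD_ATTEMPTS, ex.returncode, RETRY_DELAY_SEC)
        time.sleep(RETRY_DELAY_SEC)
```

With two simulated failures this logs two warnings, sleeps twice, and breaks on the third attempt; if all MAX_DOWNLOAD_ATTEMPTS attempts failed, the final CalledProcessError would propagate to the caller, just as in the diff.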