From 3aa3af699f7cc873c4d38ca7a97a65699479d5c2 Mon Sep 17 00:00:00 2001
From: Sooraj Sivadasan
Date: Sat, 29 Nov 2025 16:26:50 +0530
Subject: [PATCH 1/5] fix: modify `url_download` to perform multipart
 downloads using multi-threading when the server supports it, which improves
 performance and consumes fewer resources

If the server doesn't provide `Content-Length`, or if there is `data` to
POST, fall back to `_url_download`.
---
 avocado/utils/download.py | 97 ++++++++++++++++++++++++++++++++-------
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/avocado/utils/download.py b/avocado/utils/download.py
index 3fb6dbce08..9e883d2ccf 100644
--- a/avocado/utils/download.py
+++ b/avocado/utils/download.py
@@ -22,8 +22,9 @@
 import shutil
 import socket
 import sys
+import time
 import urllib.parse
-from multiprocessing import Process
+from concurrent.futures import ThreadPoolExecutor
 from urllib.error import HTTPError
 from urllib.request import urlopen
 
@@ -31,6 +32,8 @@
 
 log = logging.getLogger(__name__)
 
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+
 
 def url_open(url, data=None, timeout=5):
     """
@@ -83,24 +86,86 @@ def _url_download(url, filename, data):
     src_file.close()
 
 
-def url_download(url, filename, data=None, timeout=300):
+def _download_range(
+    url: str,
+    start: int,
+    end: int,
+    part_file: str,
+    retries: int = 3,
+    delay: int = 1,
+    timeout: int = 300,
+):
     """
-    Retrieve a file from given url.
-
-    :param url: source URL.
-    :param filename: destination path.
-    :param data: (optional) data to post.
+    Download the byte range [start, end] of url into part_file.
+    :param url: URL to download.
+    :param start: First byte of the range (inclusive).
+    :param end: Last byte of the range (inclusive).
+    :param part_file: Path of the part file to write.
+    :param retries: Total number of attempts.
+    :param delay: Delay between retries in seconds.
+    :param timeout: Per-request timeout in seconds.
+    """
+    for attempt in range(1, retries + 1):
+        try:
+            req = urllib.request.Request(
+                url, headers={"User-Agent": USER_AGENT, "Range": f"bytes={start}-{end}"}
+            )
+            with urlopen(req, timeout=timeout) as r, open(part_file, "wb") as f:
+                shutil.copyfileobj(r, f, length=1024 * 1024)  # 1MB chunks
+            return
+        except (socket.timeout, TimeoutError, HTTPError) as e:
+            if attempt == retries:
+                raise e
+            time.sleep(delay)
+
+
+def url_download(
+    url: str,
+    filename: str,
+    data: bytes | None = None,
+    timeout: int = 300,
+    segments: int = 4,
+):
+    """
+    Multipart downloader using a thread pool, splitting the file by Content-Length.
+    :param url: URL to download.
+    :param filename: Filename to save the downloaded file.
+    :param data: (optional) data to POST.
     :param timeout: (optional) default timeout in seconds.
-    :return: `None`.
+    :param segments: Number of segments to split the download into.
     """
-    process = Process(target=_url_download, args=(url, filename, data))
-    log.info("Fetching %s -> %s", url, filename)
-    process.start()
-    process.join(timeout)
-    if process.is_alive():
-        process.terminate()
-        process.join()
-        raise OSError("Aborting downloading. Timeout was reached.")
Timeout was reached.") + headers = urllib.request.urlopen( + urllib.request.Request(url, method="HEAD"), timeout=10 + ).headers # Using HEAD method to get the content length + size = int(headers.get("Content-Length", -1)) + supports_range = "bytes" in headers.get("Accept-Ranges", "").lower() + + if size <= 0 or data or not supports_range: + # if the server doesn't provide the size or accepted range / if we want sent data to the server (POST Method), + # switch to single download with urlopen + _url_download(url=url, filename=filename, data=data) + return + + part_size = size // segments + + def task(i: int): + start = i * part_size + end = size - 1 if i == segments - 1 else (start + part_size - 1) + part_file = f"{filename}.part{i}" + _download_range( + url=url, start=start, end=end, part_file=part_file, timeout=timeout + ) + return part_file + + with ThreadPoolExecutor(max_workers=segments) as executor: + part_files = list(executor.map(task, range(segments))) + + # Merge the split files and remove them + with open(filename, "wb") as f: + for part in part_files: + with open(part, "rb") as pf: + shutil.copyfileobj(pf, f) + os.remove(part) def url_download_interactive(url, output_file, title="", chunk_size=102400): From 52656fa9dc354fb9cc12aa0791e74ef5fee7ca69 Mon Sep 17 00:00:00 2001 From: Sooraj Sivadasan Date: Sat, 29 Nov 2025 19:34:30 +0530 Subject: [PATCH 2/5] fix: change bytes|None = None to `typing.Optional[bytes]` to support lower python versions --- avocado/utils/download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/avocado/utils/download.py b/avocado/utils/download.py index 9e883d2ccf..c28f8ed0fd 100644 --- a/avocado/utils/download.py +++ b/avocado/utils/download.py @@ -23,6 +23,7 @@ import socket import sys import time +import typing import urllib.parse from concurrent.futures import ThreadPoolExecutor from urllib.error import HTTPError @@ -122,7 +123,7 @@ def _download_range( def url_download( url: str, filename: str, - data: bytes | None = None, + data: typing.Optional[bytes] = None, timeout: int = 300, segments: int = 4, ): From c97438ce6e978300f637ba672e7b0261b74d5989 Mon Sep 17 00:00:00 2001 From: Sooraj Sivadasan Date: Sat, 29 Nov 2025 20:32:07 +0530 Subject: [PATCH 3/5] fix: modify functional test for `test_empty_test_list` to use `assertIn` instead of `assertEqual` due to the error shown about deprecated pkg_resources --- selftests/functional/basic.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/selftests/functional/basic.py b/selftests/functional/basic.py index 360b3aeeb4..bce00bf385 100644 --- a/selftests/functional/basic.py +++ b/selftests/functional/basic.py @@ -647,13 +647,10 @@ def test_empty_test_list(self): ) result = process.run(cmd_line, ignore_status=True) self.assertEqual(result.exit_status, exit_codes.AVOCADO_JOB_FAIL) - self.assertEqual( - result.stderr, - ( - b"Test Suite could not be created. No test references" - b" provided nor any other arguments resolved into " - b"tests\n" - ), + # TODO Modify this test to assertEqual, once the issue `deprecated pkg_resources` is solved + self.assertIn( + "Test Suite could not be created. 
+            str(result.stderr),
         )
 
     def test_not_found(self):

From a9d7c22409e8f0b3a4028862d47a94c9ed4b28ed Mon Sep 17 00:00:00 2001
From: Sooraj Sivadasan
Date: Sat, 13 Dec 2025 10:31:08 +0530
Subject: [PATCH 4/5] fix: revert `test_empty_test_list` to use `assertEqual`
 instead of `assertIn`

---
 selftests/functional/basic.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/selftests/functional/basic.py b/selftests/functional/basic.py
index bce00bf385..360b3aeeb4 100644
--- a/selftests/functional/basic.py
+++ b/selftests/functional/basic.py
@@ -647,10 +647,13 @@ def test_empty_test_list(self):
         )
         result = process.run(cmd_line, ignore_status=True)
         self.assertEqual(result.exit_status, exit_codes.AVOCADO_JOB_FAIL)
-        # TODO: Switch back to assertEqual once the deprecated pkg_resources warning is resolved
-        self.assertIn(
-            "Test Suite could not be created. No test references provided nor any other arguments resolved into tests",
-            str(result.stderr),
+        self.assertEqual(
+            result.stderr,
+            (
+                b"Test Suite could not be created. No test references"
+                b" provided nor any other arguments resolved into "
+                b"tests\n"
+            ),
         )
 
     def test_not_found(self):

From 63a6aec0c5e59b1ac832e9da00c475a22bdf83bf Mon Sep 17 00:00:00 2001
From: Sooraj Sivadasan
Date: Sun, 28 Dec 2025 13:21:01 +0530
Subject: [PATCH 5/5] Handle multipart download edge cases and filename usage.

---
 avocado/utils/download.py | 48 +++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/avocado/utils/download.py b/avocado/utils/download.py
index c28f8ed0fd..466a349dbd 100644
--- a/avocado/utils/download.py
+++ b/avocado/utils/download.py
@@ -28,6 +28,7 @@
 from concurrent.futures import ThreadPoolExecutor
 from urllib.error import HTTPError
 from urllib.request import urlopen
+from pathlib import Path
 
 from avocado.utils import crypto, output
 
@@ -125,12 +126,12 @@ def url_download(
     url: str,
     filename: str,
     data: typing.Optional[bytes] = None,
     timeout: int = 300,
-    segments: int = 4,
+    segments: int = 1,
 ):
     """
     Multipart downloader using a thread pool, splitting the file by Content-Length.
     :param url: URL to download.
-    :param filename: Filename to save the downloaded file.
+    :param filename: Target file name or full file path.
     :param data: (optional) data to POST.
     :param timeout: (optional) default timeout in seconds.
@@ -137,37 +138,54 @@ def url_download(
     :param segments: Number of segments to split the download into.
     """
     headers = urllib.request.urlopen(
         urllib.request.Request(url, method="HEAD"), timeout=10
     ).headers  # Use a HEAD request to get the content length
     size = int(headers.get("Content-Length", -1))
     supports_range = "bytes" in headers.get("Accept-Ranges", "").lower()
 
-    if size <= 0 or data or not supports_range:
-        # If the server doesn't provide the size or doesn't accept ranges, or if we want to send data (POST),
-        # fall back to a single download with urlopen
+    if segments == 1 or size <= 0 or data or not supports_range:
+        # Use a single download when the size or range support is unavailable, the request is a POST, or only one segment is requested
         _url_download(url=url, filename=filename, data=data)
         return
 
     part_size = size // segments
+    path = Path(filename)  # filename may be a bare name or a full path
+    part_files = [str(path.parent / f"temp{i}_{path.name}") for i in range(segments)]
+    if part_size == 0:
+        # File too small for segmentation, fall back to a single download
+        _url_download(url=url, filename=filename, data=data)
+        return
 
     def task(i: int):
         start = i * part_size
         end = size - 1 if i == segments - 1 else (start + part_size - 1)
-        part_file = f"{filename}.part{i}"
+        part_file = part_files[i]
         _download_range(
             url=url, start=start, end=end, part_file=part_file, timeout=timeout
         )
-        return part_file
 
-    with ThreadPoolExecutor(max_workers=segments) as executor:
-        part_files = list(executor.map(task, range(segments)))
-
-    # Merge the split files and remove them
-    with open(filename, "wb") as f:
+    try:
+        with ThreadPoolExecutor(max_workers=segments) as executor:
+            # This will raise an exception if any task fails
+            list(executor.map(task, range(segments)))
+        # Merge the split files
+        with open(filename, "wb") as f:
+            for part in part_files:
+                with open(part, "rb") as pf:
+                    shutil.copyfileobj(pf, f)
+    except Exception as e:
+        # If anything fails, remove the incomplete destination file and fall back to a single-part download
+        if os.path.exists(filename):
+            os.remove(filename)
+        log.warning(
+            "Multipart download failed (%s). Falling back to single-part download.", e
+        )
+        _url_download(url=url, filename=filename, data=data)
+    finally:
+        # Always clean up part files
        for part in part_files:
-            with open(part, "rb") as pf:
-                shutil.copyfileobj(pf, f)
-            os.remove(part)
+            if os.path.exists(part):
+                os.remove(part)
 
 
 def url_download_interactive(url, output_file, title="", chunk_size=102400):
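
---

A usage sketch for reviewers (not part of the series): how the reworked
`url_download` is expected to be called after PATCH 5. The URL and target
path here are hypothetical, and since `segments` now defaults to 1,
multipart mode is opt-in.

    from avocado.utils.download import url_download

    # A HEAD request inside url_download reads Content-Length and
    # Accept-Ranges; if either is missing, if data is given (POST), or if
    # segments == 1, it falls back to the single-stream _url_download().
    url_download(
        "https://example.org/files/big.iso",  # hypothetical URL
        "/tmp/big.iso",
        segments=4,  # fetch four byte ranges concurrently
    )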
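A second sketch of the capability probe the series relies on, useful when
checking why a given mirror ends up on the single-stream path (again, the
URL is hypothetical):

    import urllib.request

    req = urllib.request.Request("https://example.org/files/big.iso", method="HEAD")
    with urllib.request.urlopen(req, timeout=10) as resp:
        size = int(resp.headers.get("Content-Length", -1))
        accept_ranges = resp.headers.get("Accept-Ranges", "")

    # Multipart mode needs a known size and byte-range support
    print(size > 0 and "bytes" in accept_ranges.lower())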