Skip to content

Commit acecc40

Browse files
authored
feat: Add fetch_archive_from_http to fetch zip or gzip archives (#7806)
* Add fetch_archive_from_http to fetch zip or gzip archives * Add release notes
1 parent 4188bf9 commit acecc40

File tree

2 files changed

+63
-5
lines changed

2 files changed

+63
-5
lines changed

haystack/utils/import_utils.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
1-
import logging
1+
import gzip
22
import importlib
33
import importlib.util
4-
from typing import Optional, Tuple, List
5-
from urllib.parse import urlparse, unquote
6-
from os.path import splitext, basename
4+
import io
5+
import logging
6+
import zipfile
7+
from os.path import basename, splitext
8+
from pathlib import Path
9+
from typing import Dict, List, Optional, Tuple, Union
10+
from urllib.parse import unquote, urlparse
11+
12+
import requests
713

814
from haystack.errors import DatasetsError
915
from haystack.schema import Document
1016

11-
1217
logger = logging.getLogger(__name__)
1318

1419

@@ -55,5 +60,54 @@ def get_filename_extension_from_url(url: str) -> Tuple[str, str]:
5560
return file_name, archive_extension
5661

5762

63+
def fetch_archive_from_http(
    url: str,
    output_dir: str,
    proxies: Optional[Dict[str, str]] = None,
    timeout: Union[float, Tuple[float, float]] = 10.0,
) -> bool:
    """
    Fetch an archive (zip or gz) from a url via http and extract content to an output directory.

    :param url: http address
    :param output_dir: local path
    :param proxies: proxies details as required by requests library
    :param timeout: How many seconds to wait for the server to send data before giving up,
        as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple.
        Defaults to 10 seconds.
    :return: if anything got fetched
    """
    # Verify & prepare the local directory; create intermediate dirs as needed.
    path = Path(output_dir)
    if not path.exists():
        path.mkdir(parents=True)

    # Treat a non-empty directory as "already fetched" and skip the download.
    is_not_empty = len(list(path.rglob("*"))) > 0
    if is_not_empty:
        logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir)
        return False

    logger.info("Fetching from %s to '%s'", url, output_dir)

    file_name, archive_extension = get_filename_extension_from_url(url)
    request_data = requests.get(url, proxies=proxies, timeout=timeout)
    # Fail fast on HTTP errors (4xx/5xx) instead of trying to unpack an error page as an archive.
    request_data.raise_for_status()

    if archive_extension == "zip":
        # Context manager ensures the archive handle is closed after extraction.
        with zipfile.ZipFile(io.BytesIO(request_data.content)) as zip_archive:
            zip_archive.extractall(output_dir)
    elif archive_extension == "gz" and "tar.gz" not in url:
        with gzip.GzipFile(fileobj=io.BytesIO(request_data.content)) as gzip_archive:
            file_content = gzip_archive.read()
        (path / file_name).write_bytes(file_content)
    else:
        # NOTE(review): the download already happened, so the original contract of
        # returning True is preserved here even though nothing was extracted.
        logger.warning(
            "Skipped url %s as file type is not supported here. "
            "See haystack documentation for support of more file types",
            url,
        )

    return True
110+
111+
58112
def is_whisper_available():
    """Return True if the `whisper` package can be located for import, False otherwise."""
    spec = importlib.util.find_spec("whisper")
    return spec is not None
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
Add the previously removed `fetch_archive_from_http` util function to fetch zip and gzip archives from a URL.

0 commit comments

Comments
 (0)