feat: Add fetch_archive_from_http to fetch zip or gzip archives (#7806)

* Add fetch_archive_from_http to fetch zip or gzip archives

* Add release notes
silvanocerza authored Jun 5, 2024
1 parent 4188bf9 commit acecc40
Showing 2 changed files with 63 additions and 5 deletions.
64 changes: 59 additions & 5 deletions haystack/utils/import_utils.py
@@ -1,14 +1,19 @@
-import logging
+import gzip
 import importlib
 import importlib.util
-from typing import Optional, Tuple, List
-from urllib.parse import urlparse, unquote
-from os.path import splitext, basename
+import io
+import logging
+import zipfile
+from os.path import basename, splitext
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+from urllib.parse import unquote, urlparse

 import requests
 
 from haystack.errors import DatasetsError
 from haystack.schema import Document
 
 
 logger = logging.getLogger(__name__)


@@ -55,5 +60,54 @@ def get_filename_extension_from_url(url: str) -> Tuple[str, str]:
     return file_name, archive_extension


+def fetch_archive_from_http(
+    url: str,
+    output_dir: str,
+    proxies: Optional[Dict[str, str]] = None,
+    timeout: Union[float, Tuple[float, float]] = 10.0,
+) -> bool:
+    """
+    Fetch an archive (zip or gz) from a URL via HTTP and extract its content to an output directory.
+
+    :param url: HTTP address of the archive.
+    :param output_dir: Local path the archive content is extracted to.
+    :param proxies: Proxy details as required by the requests library.
+    :param timeout: How many seconds to wait for the server to send data before giving up,
+        as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple.
+        Defaults to 10 seconds.
+    :return: ``True`` if something was fetched, ``False`` otherwise.
+    """
+    # Verify and prepare the local directory.
+    path = Path(output_dir)
+    if not path.exists():
+        path.mkdir(parents=True)
+
+    is_not_empty = len(list(path.rglob("*"))) > 0
+    if is_not_empty:
+        logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir)
+        return False
+
+    logger.info("Fetching from %s to '%s'", url, output_dir)
+
+    file_name, archive_extension = get_filename_extension_from_url(url)
+    request_data = requests.get(url, proxies=proxies, timeout=timeout)
+
+    if archive_extension == "zip":
+        zip_archive = zipfile.ZipFile(io.BytesIO(request_data.content))
+        zip_archive.extractall(output_dir)
+    elif archive_extension == "gz" and "tar.gz" not in url:
+        gzip_archive = gzip.GzipFile(fileobj=io.BytesIO(request_data.content))
+        file_content = gzip_archive.read()
+        with open(f"{output_dir}/{file_name}", "wb") as file:
+            file.write(file_content)
+    else:
+        logger.warning(
+            "Skipped URL %s as its file type is not supported here. "
+            "See the Haystack documentation for support of more file types.",
+            url,
+        )
+
+    return True


 def is_whisper_available():
     return importlib.util.find_spec("whisper") is not None
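
A minimal usage sketch of the new helper, based on the diff above; the URL and output path are illustrative, not from the commit:

from haystack.utils.import_utils import fetch_archive_from_http

# Fetch a zip archive and extract it into ./data/sample (hypothetical URL).
fetched = fetch_archive_from_http(
    url="https://example.com/datasets/sample.zip",
    output_dir="data/sample",
    timeout=(3.05, 27.0),  # optional (connect, read) timeout tuple, per the docstring
)

# A second call against the now non-empty directory logs a notice and
# returns False instead of downloading again.
fetched_again = fetch_archive_from_http(
    url="https://example.com/datasets/sample.zip",
    output_dir="data/sample",
)
assert fetched_again is False  # assuming the first call fetched something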
4 changes: 4 additions & 0 deletions releasenotes/notes/safe-fetch-4ba829def3241eec.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add the previously removed `fetch_archive_from_http` util function to fetch zip and gzip archives from a URL.
