|
1 |
| -import logging |
| 1 | +import gzip |
2 | 2 | import importlib
|
3 | 3 | import importlib.util
|
4 |
| -from typing import Optional, Tuple, List |
5 |
| -from urllib.parse import urlparse, unquote |
6 |
| -from os.path import splitext, basename |
| 4 | +import io |
| 5 | +import logging |
| 6 | +import zipfile |
| 7 | +from os.path import basename, splitext |
| 8 | +from pathlib import Path |
| 9 | +from typing import Dict, List, Optional, Tuple, Union |
| 10 | +from urllib.parse import unquote, urlparse |
| 11 | + |
| 12 | +import requests |
7 | 13 |
|
8 | 14 | from haystack.errors import DatasetsError
|
9 | 15 | from haystack.schema import Document
|
10 | 16 |
|
11 |
| - |
12 | 17 | logger = logging.getLogger(__name__)
|
13 | 18 |
|
14 | 19 |
|
@@ -55,5 +60,54 @@ def get_filename_extension_from_url(url: str) -> Tuple[str, str]:
|
55 | 60 | return file_name, archive_extension
|
56 | 61 |
|
57 | 62 |
|
def fetch_archive_from_http(
    url: str,
    output_dir: str,
    proxies: Optional[Dict[str, str]] = None,
    timeout: Union[float, Tuple[float, float]] = 10.0,
) -> bool:
    """
    Fetch an archive (zip or gz) from a url via http and extract content to an output directory.

    :param url: http address
    :param output_dir: local path
    :param proxies: proxies details as required by requests library
    :param timeout: How many seconds to wait for the server to send data before giving up,
        as a float, or a :ref:`(connect timeout, read timeout) <timeouts>` tuple.
        Defaults to 10 seconds.
    :return: if anything got fetched
    """
    # verify & prepare local directory
    path = Path(output_dir)
    path.mkdir(parents=True, exist_ok=True)

    # Refuse to overwrite existing data: any file or subdirectory counts as "not empty".
    if any(path.rglob("*")):
        logger.info("Found data stored in '%s'. Delete this first if you really want to fetch new data.", output_dir)
        return False

    logger.info("Fetching from %s to '%s'", url, output_dir)

    file_name, archive_extension = get_filename_extension_from_url(url)
    response = requests.get(url, proxies=proxies, timeout=timeout)
    # Fail early with a clear HTTP error instead of feeding an error page into the
    # archive decoders (which would raise a confusing BadZipFile/BadGzipFile).
    response.raise_for_status()

    if archive_extension == "zip":
        # Context manager guarantees the archive handle is released.
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_archive:
            zip_archive.extractall(output_dir)
    elif archive_extension == "gz" and "tar.gz" not in url:
        with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gzip_archive:
            file_content = gzip_archive.read()
        with open(path / file_name, "wb") as file:
            file.write(file_content)
    else:
        logger.warning(
            "Skipped url %s as file type is not supported here. "
            "See haystack documentation for support of more file types",
            url,
        )
        # Nothing was extracted or written, so report that nothing got fetched.
        return False

    return True
| 110 | + |
| 111 | + |
def is_whisper_available():
    """Return True if the `whisper` package is importable in this environment."""
    spec = importlib.util.find_spec("whisper")
    return spec is not None
|
0 commit comments