
Commit 0151672

Merge pull request #264 from bellingcat/minor_fixes
Minor fixes
2 parents: 42e16ae + a066bf4

File tree

8 files changed: +226 −72 lines changed


pyproject.toml

+1 −1

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "0.13.7"
+version = "0.13.8"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"

src/auto_archiver/core/base_module.py

+11 −1

@@ -71,7 +71,16 @@ def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
         :param site: the domain of the site to get authentication information for
         :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
 
-        :returns: authdict dict of login information for the given site
+        :returns: authdict dict -> {
+            "username": str,
+            "password": str,
+            "api_key": str,
+            "api_secret": str,
+            "cookie": str,
+            "cookies_file": str,
+            "cookies_from_browser": str,
+            "cookies_jar": CookieJar
+        }
 
         **Global options:**\n
         * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n

@@ -85,6 +94,7 @@ def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
         * cookie: str - a cookie string to use for login (specific to this site)\n
         * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
         * cookies_from_browser: str - the name of the browser to extract cookies from (specitic for this site)\n
+
         """
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
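
The expanded docstring now lists every key a caller can expect in the returned authdict. As a rough illustration of how a module might branch on those keys, here is a small, hypothetical helper (the function name and the dispatch order are an illustrative sketch, not code from auto-archiver):

from typing import Any, Mapping

def pick_auth_method(auth: Mapping[str, Any]) -> str:
    """Illustrative helper: report which kind of credential an authdict offers."""
    if auth.get("cookies_jar"):
        return "cookie_jar"    # CookieJar built from cookies_file / cookies_from_browser
    if auth.get("cookie") or auth.get("cookies_file") or auth.get("cookies_from_browser"):
        return "cookies"       # raw cookie string, or a source cookies can be extracted from
    if auth.get("username") and auth.get("password"):
        return "credentials"   # plain username/password login
    if auth.get("api_key") or auth.get("api_secret"):
        return "api"           # API-key based access
    return "none"

# e.g. pick_auth_method(module.auth_for_site("example.com")) -> "cookies"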

src/auto_archiver/modules/instagram_extractor/instagram_extractor.py

+3

@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
     # TODO: links to stories
 
     def setup(self) -> None:
+        logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
+        logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+
         self.insta = instaloader.Instaloader(
             download_geotags=True,
             download_comments=True,

src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py

+13 −4

@@ -19,12 +19,21 @@ def __init__(self, webdriver_factory=None):
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
 
-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
         logger.debug(f"Enriching screenshot for {url=}")
         auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
+                        Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
+
         with self.webdriver_factory(
             self.width,
             self.height,
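
To make the new gate concrete, here is a standalone sketch of the cookie-only check with made-up auth dicts (the example values are assumptions; only cookie-type entries let the enricher proceed past an auth wall, everything else is skipped with a warning):

def has_valid_auth(auth: dict) -> bool:
    # mirrors the check added in enrich(): only cookie-type credentials work with Selenium
    return bool(auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie")))

assert has_valid_auth({"cookie": "sessionid=abc123"})                # raw cookie string
assert has_valid_auth({"cookies_jar": object()})                     # pre-built cookie jar
assert not has_valid_auth({"username": "user", "password": "pass"})  # warned about, then skipped
assert not has_valid_auth({})                                        # no auth at all -> skipped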

src/auto_archiver/utils/url.py

+39 −52

@@ -4,8 +4,8 @@
 
 
 AUTHWALL_URLS = [
-    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
 ]
 
 

@@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
     """
     clean_url = remove_get_parameters(url)
 
-    # favicons
-    if "favicon" in url:
-        return False
-    # ifnore icons
-    if clean_url.endswith(".ico"):
-        return False
-    # ignore SVGs
-    if remove_get_parameters(url).endswith(".svg"):
-        return False
-
-    # twitter profile pictures
-    if "twimg.com/profile_images" in url:
-        return False
-    if "twimg.com" in url and "/default_profile_images" in url:
-        return False
-
-    # instagram profile pictures
-    if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
-        return False
-    # instagram recurring images
-    if "https://static.cdninstagram.com/rsrc.php/" in url:
-        return False
-
-    # telegram
-    if "https://telegram.org/img/emoji/" in url:
-        return False
-
-    # youtube
-    if "https://www.youtube.com/s/gaming/emoji/" in url:
-        return False
-    if "https://yt3.ggpht.com" in url and "default-user=" in url:
-        return False
-    if "https://www.youtube.com/s/search/audio/" in url:
-        return False
-
-    # ok
-    if " https://ok.ru/res/i/" in url:
-        return False
-
-    # vk
-    if "https://vk.com/emoji/" in url:
-        return False
-    if "vk.com/images/" in url:
-        return False
-    if "vk.com/images/reaction/" in url:
-        return False
-
-    # wikipedia
-    if "wikipedia.org/static" in url:
-        return False
+    IRRELEVANT_URLS = [
+        # favicons
+        ("favicon",),
+        # twitter profile pictures
+        ("twimg.com/profile_images",),
+        ("twimg.com", "default_profile_images"),
+        # instagram profile pictures
+        ("https://scontent.cdninstagram.com/", "150x150"),
+        # instagram recurring images
+        ("https://static.cdninstagram.com/rsrc.php/",),
+        # telegram
+        ("https://telegram.org/img/emoji/",),
+        # youtube
+        ("https://www.youtube.com/s/gaming/emoji/",),
+        ("https://yt3.ggpht.com", "default-user="),
+        ("https://www.youtube.com/s/search/audio/",),
+        # ok
+        ("https://ok.ru/res/i/",),
+        ("https://vk.com/emoji/",),
+        ("vk.com/images/",),
+        ("vk.com/images/reaction/",),
+        # wikipedia
+        ("wikipedia.org/static",),
+    ]
+
+    IRRELEVANT_ENDS_WITH = [
+        ".svg",  # ignore SVGs
+        ".ico",  # ignore icons
+    ]
+
+    for end in IRRELEVANT_ENDS_WITH:
+        if clean_url.endswith(end):
+            return False
+
+    for parts in IRRELEVANT_URLS:
+        if all(part in clean_url for part in parts):
+            return False
 
     return True
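
The "https?" and "(www\.)?" additions widen what counts as an auth-walled URL. A small, self-contained sketch of the effect (the is_auth_wall body below is an assumption reconstructed from the regex list above; the real implementation lives in src/auto_archiver/utils/url.py):

import re

AUTHWALL_URLS = [
    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
]

def is_auth_wall(url: str) -> bool:
    return any(regex.match(url) for regex in AUTHWALL_URLS)

print(is_auth_wall("http://instagram.com/p/abc"))   # True: plain http and missing www now match
print(is_auth_wall("https://t.me/c/channel/123"))   # True: private telegram channel
print(is_auth_wall("https://t.me/public_channel"))  # False: public channels are not walled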

src/auto_archiver/utils/webdriver.py

+12 −12

@@ -22,35 +22,35 @@
 
 class CookieSettingDriver(webdriver.Firefox):
     facebook_accept_cookies: bool
-    cookies: str
-    cookiejar: MozillaCookieJar
+    cookie: str
+    cookie_jar: MozillaCookieJar
 
-    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
        if os.environ.get("RUNNING_IN_DOCKER"):
            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
            kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
 
        super(CookieSettingDriver, self).__init__(*args, **kwargs)
-        self.cookies = cookies
-        self.cookiejar = cookiejar
+        self.cookie = cookie
+        self.cookie_jar = cookie_jar
        self.facebook_accept_cookies = facebook_accept_cookies
 
     def get(self, url: str):
-        if self.cookies or self.cookiejar:
+        if self.cookie_jar or self.cookie:
            # set up the driver to make it not 'cookie averse' (needs a context/URL)
            # get the 'robots.txt' file which should be quick and easy
            robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
            super(CookieSettingDriver, self).get(robots_url)
 
-            if self.cookies:
+            if self.cookie:
                # an explicit cookie is set for this site, use that first
                for cookie in self.cookies.split(";"):
                    for name, value in cookie.split("="):
                        self.driver.add_cookie({"name": name, "value": value})
-            elif self.cookiejar:
-                domain = urlparse(url).netloc
+            elif self.cookie_jar:
+                domain = urlparse(url).netloc.removeprefix("www.")
                regex = re.compile(f"(www)?.?{domain}$")
-                for cookie in self.cookiejar:
+                for cookie in self.cookie_jar:
                    if regex.match(cookie.domain):
                        try:
                            self.add_cookie(

@@ -145,8 +145,8 @@ def __enter__(self) -> webdriver:
 
        try:
            self.driver = CookieSettingDriver(
-                cookies=self.auth.get("cookies"),
-                cookiejar=self.auth.get("cookies_jar"),
+                cookie=self.auth.get("cookie"),
+                cookie_jar=self.auth.get("cookies_jar"),
                facebook_accept_cookies=self.facebook_accept_cookies,
                options=options,
            )
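
The .removeprefix("www.") change is easy to miss, so here is a standalone sketch of the domain-matching logic showing why it matters (the domain_pattern helper and the example domains are illustrative, not code from the repo):

import re
from urllib.parse import urlparse

def domain_pattern(url: str) -> re.Pattern:
    # cookies in a Mozilla cookie jar are commonly scoped to ".example.com" or "example.com";
    # building the pattern from "www.example.com" (the old behaviour) would never match those
    domain = urlparse(url).netloc.removeprefix("www.")
    return re.compile(f"(www)?.?{domain}$")

pattern = domain_pattern("https://www.example.com/post/1")
print(bool(pattern.match(".example.com")))     # True: jar cookies scoped to the bare domain
print(bool(pattern.match("www.example.com")))  # True: www-prefixed cookies still match
print(bool(pattern.match("other.com")))        # False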

tests/enrichers/test_screenshot_enricher.py

+34 −2

@@ -85,8 +85,8 @@ def test_enrich_adds_screenshot(
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
     screenshot_enricher.enrich(metadata_with_video)
     mock_driver_class.assert_called_once_with(
-        cookies=None,
-        cookiejar=None,
+        cookie=None,
+        cookie_jar=None,
         facebook_accept_cookies=False,
         options=mock_options_instance,
     )

@@ -124,6 +124,38 @@ def test_enrich_auth_wall(
     assert metadata_with_video.media[1].properties.get("id") == "screenshot"
 
 
+def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))
+        assert "[SKIP] SCREENSHOT since url" in caplog.text
+
+
+@pytest.mark.parametrize(
+    "auth",
+    [
+        {"cookie": "cookie"},
+        {"cookies_jar": "cookie"},
+    ],
+)
+def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+
+    # patch the authentication dict:
+    screenshot_enricher.authentication = {"example.com": auth}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "[SKIP] SCREENSHOT since url" not in caplog.text
+
+
+def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
+    mock_driver, mock_driver_class, _ = mock_selenium_env
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
+
+
 def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env

tests/utils/test_urls.py

+113

@@ -0,0 +1,113 @@
+import pytest
+from auto_archiver.utils.url import (
+    is_auth_wall,
+    check_url_or_raise,
+    domain_for_url,
+    is_relevant_url,
+    remove_get_parameters,
+    twitter_best_quality_url,
+)
+
+
+@pytest.mark.parametrize(
+    "url, is_auth",
+    [
+        ("https://example.com", False),
+        ("https://t.me/c/abc/123", True),
+        ("https://t.me/not-private/", False),
+        ("https://instagram.com", True),
+        ("https://www.instagram.com", True),
+        ("https://www.instagram.com/p/INVALID", True),
+        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
+    ],
+)
+def test_is_auth_wall(url, is_auth):
+    assert is_auth_wall(url) == is_auth
+
+
+@pytest.mark.parametrize(
+    "url, raises",
+    [
+        ("http://example.com", False),
+        ("https://example.com", False),
+        ("ftp://example.com", True),
+        ("http://localhost", True),
+        ("http://", True),
+    ],
+)
+def test_check_url_or_raise(url, raises):
+    if raises:
+        with pytest.raises(ValueError):
+            check_url_or_raise(url)
+    else:
+        assert check_url_or_raise(url)
+
+
+@pytest.mark.parametrize(
+    "url, domain",
+    [
+        ("https://example.com", "example.com"),
+        ("https://www.example.com", "www.example.com"),
+        ("https://www.example.com/path", "www.example.com"),
+        ("https://", ""),
+        ("http://localhost", "localhost"),
+    ],
+)
+def test_domain_for_url(url, domain):
+    assert domain_for_url(url) == domain
+
+
+@pytest.mark.parametrize(
+    "url, without_get",
+    [
+        ("https://example.com", "https://example.com"),
+        ("https://example.com?utm_source=example", "https://example.com"),
+        ("https://example.com?utm_source=example&other=1", "https://example.com"),
+        ("https://example.com/something", "https://example.com/something"),
+        ("https://example.com/something?utm_source=example", "https://example.com/something"),
+    ],
+)
+def test_remove_get_parameters(url, without_get):
+    assert remove_get_parameters(url) == without_get
+
+
+@pytest.mark.parametrize(
+    "url, relevant",
+    [
+        ("https://example.com", True),
+        ("https://example.com/favicon.ico", False),
+        ("https://twimg.com/profile_images", False),
+        ("https://twimg.com/something/default_profile_images", False),
+        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
+        ("https://static.cdninstagram.com/rsrc.php/", False),
+        ("https://telegram.org/img/emoji/", False),
+        ("https://www.youtube.com/s/gaming/emoji/", False),
+        ("https://yt3.ggpht.com/default-user=", False),
+        ("https://www.youtube.com/s/search/audio/", False),
+        ("https://ok.ru/res/i/", False),
+        ("https://vk.com/emoji/", False),
+        ("https://vk.com/images/", False),
+        ("https://vk.com/images/reaction/", False),
+        ("https://wikipedia.org/static", False),
+        ("https://example.com/file.svg", False),
+        ("https://example.com/file.ico", False),
+        ("https://example.com/file.mp4", True),
+        ("https://example.com/150x150.jpg", True),
+        ("https://example.com/rsrc.php/", True),
+        ("https://example.com/img/emoji/", True),
+    ],
+)
+def test_is_relevant_url(url, relevant):
+    assert is_relevant_url(url) == relevant
+
+
+@pytest.mark.parametrize(
+    "url, best_quality",
+    [
+        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
+        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
+    ],
+)
+def test_twitter_best_quality_url(url, best_quality):
+    assert twitter_best_quality_url(url) == best_quality
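
Assuming the package is installed and importable, the behaviours these tests lock in can also be checked interactively; a brief sketch:

from auto_archiver.utils.url import is_auth_wall, is_relevant_url, twitter_best_quality_url

assert is_auth_wall("https://instagram.com")  # no-www and plain http forms now count as auth-walled
assert not is_relevant_url("https://example.com/favicon.ico")
assert twitter_best_quality_url("https://twitter.com/some_image.jpg?name=small").endswith("name=orig")

# or run the whole module: pytest tests/utils/test_urls.py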
