
Commit 0151672

Merge pull request #264 from bellingcat/minor_fixes
Minor fixes
2 parents: 42e16ae + a066bf4

File tree

8 files changed: +226 −72 lines changed


pyproject.toml

+1 −1

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [project]
 name = "auto-archiver"
-version = "0.13.7"
+version = "0.13.8"
 description = "Automatically archive links to videos, images, and social media content from Google Sheets (and more)."
 
 requires-python = ">=3.10,<3.13"

src/auto_archiver/core/base_module.py

+11 −1

@@ -71,7 +71,16 @@ def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
         :param site: the domain of the site to get authentication information for
         :param extract_cookies: whether or not to extract cookies from the given browser/file and return the cookie jar (disabling can speed up processing if you don't actually need the cookies jar).
 
-        :returns: authdict dict of login information for the given site
+        :returns: authdict dict -> {
+            "username": str,
+            "password": str,
+            "api_key": str,
+            "api_secret": str,
+            "cookie": str,
+            "cookies_file": str,
+            "cookies_from_browser": str,
+            "cookies_jar": CookieJar
+        }
 
         **Global options:**\n
         * cookies_from_browser: str - the name of the browser to extract cookies from (e.g. 'chrome', 'firefox' - uses ytdlp under the hood to extract\n

@@ -85,6 +94,7 @@ def auth_for_site(self, site: str, extract_cookies=True) -> Mapping[str, Any]:
         * cookie: str - a cookie string to use for login (specific to this site)\n
         * cookies_file: str - the path to a cookies file to use for login (specific to this site)\n
         * cookies_from_browser: str - the name of the browser to extract cookies from (specitic for this site)\n
+
         """
         # TODO: think about if/how we can deal with sites that have multiple domains (main one is x.com/twitter.com)
         # for now the user must enter them both, like "x.com,twitter.com" in their config. Maybe we just hard-code?
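
The expanded docstring now lists every key a caller can expect in the returned authdict. As a rough illustration of how a module might branch on those keys, here is a small, hypothetical helper (the function name and the dispatch order are an illustrative sketch, not code from auto-archiver):

from typing import Any, Mapping

def pick_auth_method(auth: Mapping[str, Any]) -> str:
    """Illustrative helper: report which kind of credential an authdict offers."""
    if auth.get("cookies_jar"):
        return "cookie_jar"    # CookieJar built from cookies_file / cookies_from_browser
    if auth.get("cookie") or auth.get("cookies_file") or auth.get("cookies_from_browser"):
        return "cookies"       # raw cookie string, or a source cookies can be extracted from
    if auth.get("username") and auth.get("password"):
        return "credentials"   # plain username/password login
    if auth.get("api_key") or auth.get("api_secret"):
        return "api"           # API-key based access
    return "none"

# e.g. pick_auth_method(module.auth_for_site("example.com")) -> "cookies"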

src/auto_archiver/modules/instagram_extractor/instagram_extractor.py

+3

@@ -29,6 +29,9 @@ class InstagramExtractor(Extractor):
     # TODO: links to stories
 
     def setup(self) -> None:
+        logger.warning("Instagram Extractor is not actively maintained, and may not work as expected.")
+        logger.warning("Please consider using the Instagram Tbot Extractor or Instagram API Extractor instead.")
+
         self.insta = instaloader.Instaloader(
             download_geotags=True,
             download_comments=True,

src/auto_archiver/modules/screenshot_enricher/screenshot_enricher.py

+13 −4

@@ -19,12 +19,21 @@ def __init__(self, webdriver_factory=None):
     def enrich(self, to_enrich: Metadata) -> None:
         url = to_enrich.get_url()
 
-        if UrlUtil.is_auth_wall(url):
-            logger.debug(f"[SKIP] SCREENSHOT since url is behind AUTH WALL: {url=}")
-            return
-
         logger.debug(f"Enriching screenshot for {url=}")
         auth = self.auth_for_site(url)
+
+        # screenshot enricher only supports cookie-type auth (selenium)
+        has_valid_auth = auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie"))
+
+        if UrlUtil.is_auth_wall(url) and not has_valid_auth:
+            logger.warning(f"[SKIP] SCREENSHOT since url is behind AUTH WALL and no login details provided: {url=}")
+            if any(auth.get(key) for key in ["username", "password", "api_key", "api_secret"]):
+                logger.warning(
+                    f"Screenshot enricher only supports cookie-type authentication, you have provided {auth.keys()} which are not supported.\
+                        Consider adding 'cookie', 'cookies_file' or 'cookies_from_browser' to your auth for this site."
+                )
+            return
+
         with self.webdriver_factory(
             self.width,
             self.height,
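
To make the new gate concrete, here is a standalone sketch of the cookie-only check with made-up auth dicts (the example values are assumptions; only cookie-type entries let the enricher proceed past an auth wall, everything else is skipped with a warning):

def has_valid_auth(auth: dict) -> bool:
    # mirrors the check added in enrich(): only cookie-type credentials work with Selenium
    return bool(auth and (auth.get("cookies") or auth.get("cookies_jar") or auth.get("cookie")))

assert has_valid_auth({"cookie": "sessionid=abc123"})                # raw cookie string
assert has_valid_auth({"cookies_jar": object()})                     # pre-built cookie jar
assert not has_valid_auth({"username": "user", "password": "pass"})  # warned about, then skipped
assert not has_valid_auth({})                                        # no auth at all -> skipped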

src/auto_archiver/utils/url.py

+39 −52

@@ -4,8 +4,8 @@
 
 
 AUTHWALL_URLS = [
-    re.compile(r"https:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
-    re.compile(r"https:\/\/www\.instagram\.com"),  # instagram
+    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
+    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
 ]
 
 

@@ -81,56 +81,43 @@ def is_relevant_url(url: str) -> bool:
     """
     clean_url = remove_get_parameters(url)
 
-    # favicons
-    if "favicon" in url:
-        return False
-    # ifnore icons
-    if clean_url.endswith(".ico"):
-        return False
-    # ignore SVGs
-    if remove_get_parameters(url).endswith(".svg"):
-        return False
-
-    # twitter profile pictures
-    if "twimg.com/profile_images" in url:
-        return False
-    if "twimg.com" in url and "/default_profile_images" in url:
-        return False
-
-    # instagram profile pictures
-    if "https://scontent.cdninstagram.com/" in url and "150x150" in url:
-        return False
-    # instagram recurring images
-    if "https://static.cdninstagram.com/rsrc.php/" in url:
-        return False
-
-    # telegram
-    if "https://telegram.org/img/emoji/" in url:
-        return False
-
-    # youtube
-    if "https://www.youtube.com/s/gaming/emoji/" in url:
-        return False
-    if "https://yt3.ggpht.com" in url and "default-user=" in url:
-        return False
-    if "https://www.youtube.com/s/search/audio/" in url:
-        return False
-
-    # ok
-    if " https://ok.ru/res/i/" in url:
-        return False
-
-    # vk
-    if "https://vk.com/emoji/" in url:
-        return False
-    if "vk.com/images/" in url:
-        return False
-    if "vk.com/images/reaction/" in url:
-        return False
-
-    # wikipedia
-    if "wikipedia.org/static" in url:
-        return False
+    IRRELEVANT_URLS = [
+        # favicons
+        ("favicon",),
+        # twitter profile pictures
+        ("twimg.com/profile_images",),
+        ("twimg.com", "default_profile_images"),
+        # instagram profile pictures
+        ("https://scontent.cdninstagram.com/", "150x150"),
+        # instagram recurring images
+        ("https://static.cdninstagram.com/rsrc.php/",),
+        # telegram
+        ("https://telegram.org/img/emoji/",),
+        # youtube
+        ("https://www.youtube.com/s/gaming/emoji/",),
+        ("https://yt3.ggpht.com", "default-user="),
+        ("https://www.youtube.com/s/search/audio/",),
+        # ok
+        ("https://ok.ru/res/i/",),
+        ("https://vk.com/emoji/",),
+        ("vk.com/images/",),
+        ("vk.com/images/reaction/",),
+        # wikipedia
+        ("wikipedia.org/static",),
+    ]
+
+    IRRELEVANT_ENDS_WITH = [
+        ".svg",  # ignore SVGs
+        ".ico",  # ignore icons
+    ]
+
+    for end in IRRELEVANT_ENDS_WITH:
+        if clean_url.endswith(end):
+            return False
+
+    for parts in IRRELEVANT_URLS:
+        if all(part in clean_url for part in parts):
+            return False
 
     return True
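
The "https?" and "(www\.)?" additions widen what counts as an auth-walled URL. A small, self-contained sketch of the effect (the is_auth_wall body below is an assumption reconstructed from the regex list above; the real implementation lives in src/auto_archiver/utils/url.py):

import re

AUTHWALL_URLS = [
    re.compile(r"https?:\/\/t\.me(\/c)\/(.+)\/(\d+)"),  # telegram private channels
    re.compile(r"https?:\/\/(www\.)?instagram\.com"),  # instagram
]

def is_auth_wall(url: str) -> bool:
    return any(regex.match(url) for regex in AUTHWALL_URLS)

print(is_auth_wall("http://instagram.com/p/abc"))   # True: plain http and missing www now match
print(is_auth_wall("https://t.me/c/channel/123"))   # True: private telegram channel
print(is_auth_wall("https://t.me/public_channel"))  # False: public channels are not walled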

src/auto_archiver/utils/webdriver.py

+12 −12

@@ -22,35 +22,35 @@
 
 class CookieSettingDriver(webdriver.Firefox):
     facebook_accept_cookies: bool
-    cookies: str
-    cookiejar: MozillaCookieJar
+    cookie: str
+    cookie_jar: MozillaCookieJar
 
-    def __init__(self, cookies, cookiejar, facebook_accept_cookies, *args, **kwargs):
+    def __init__(self, cookie, cookie_jar, facebook_accept_cookies, *args, **kwargs):
        if os.environ.get("RUNNING_IN_DOCKER"):
            # Selenium doesn't support linux-aarch64 driver, we need to set this manually
            kwargs["service"] = webdriver.FirefoxService(executable_path="/usr/local/bin/geckodriver")
 
        super(CookieSettingDriver, self).__init__(*args, **kwargs)
-        self.cookies = cookies
-        self.cookiejar = cookiejar
+        self.cookie = cookie
+        self.cookie_jar = cookie_jar
        self.facebook_accept_cookies = facebook_accept_cookies
 
     def get(self, url: str):
-        if self.cookies or self.cookiejar:
+        if self.cookie_jar or self.cookie:
            # set up the driver to make it not 'cookie averse' (needs a context/URL)
            # get the 'robots.txt' file which should be quick and easy
            robots_url = urlunparse(urlparse(url)._replace(path="/robots.txt", query="", fragment=""))
            super(CookieSettingDriver, self).get(robots_url)
 
-            if self.cookies:
+            if self.cookie:
                # an explicit cookie is set for this site, use that first
                for cookie in self.cookies.split(";"):
                    for name, value in cookie.split("="):
                        self.driver.add_cookie({"name": name, "value": value})
-            elif self.cookiejar:
-                domain = urlparse(url).netloc
+            elif self.cookie_jar:
+                domain = urlparse(url).netloc.removeprefix("www.")
                regex = re.compile(f"(www)?.?{domain}$")
-                for cookie in self.cookiejar:
+                for cookie in self.cookie_jar:
                    if regex.match(cookie.domain):
                        try:
                            self.add_cookie(

@@ -145,8 +145,8 @@ def __enter__(self) -> webdriver:
 
        try:
            self.driver = CookieSettingDriver(
-                cookies=self.auth.get("cookies"),
-                cookiejar=self.auth.get("cookies_jar"),
+                cookie=self.auth.get("cookie"),
+                cookie_jar=self.auth.get("cookies_jar"),
                facebook_accept_cookies=self.facebook_accept_cookies,
                options=options,
            )
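
The .removeprefix("www.") change is easy to miss, so here is a standalone sketch of the domain-matching logic showing why it matters (the domain_pattern helper and the example domains are illustrative, not code from the repo):

import re
from urllib.parse import urlparse

def domain_pattern(url: str) -> re.Pattern:
    # cookies in a Mozilla cookie jar are commonly scoped to ".example.com" or "example.com";
    # building the pattern from "www.example.com" (the old behaviour) would never match those
    domain = urlparse(url).netloc.removeprefix("www.")
    return re.compile(f"(www)?.?{domain}$")

pattern = domain_pattern("https://www.example.com/post/1")
print(bool(pattern.match(".example.com")))     # True: jar cookies scoped to the bare domain
print(bool(pattern.match("www.example.com")))  # True: www-prefixed cookies still match
print(bool(pattern.match("other.com")))        # False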

tests/enrichers/test_screenshot_enricher.py

+34 −2

@@ -85,8 +85,8 @@ def test_enrich_adds_screenshot(
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env
     screenshot_enricher.enrich(metadata_with_video)
     mock_driver_class.assert_called_once_with(
-        cookies=None,
-        cookiejar=None,
+        cookie=None,
+        cookie_jar=None,
         facebook_accept_cookies=False,
         options=mock_options_instance,
     )

@@ -124,6 +124,38 @@ def test_enrich_auth_wall(
     assert metadata_with_video.media[1].properties.get("id") == "screenshot"
 
 
+def test_skip_authwall_no_cookies(screenshot_enricher, caplog):
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://instagram.com"))
+        assert "[SKIP] SCREENSHOT since url" in caplog.text
+
+
+@pytest.mark.parametrize(
+    "auth",
+    [
+        {"cookie": "cookie"},
+        {"cookies_jar": "cookie"},
+    ],
+)
+def test_dont_skip_authwall_with_cookies(screenshot_enricher, caplog, mocker, mock_selenium_env, auth):
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+
+    # patch the authentication dict:
+    screenshot_enricher.authentication = {"example.com": auth}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "[SKIP] SCREENSHOT since url" not in caplog.text
+
+
+def test_show_warning_wrong_auth_type(screenshot_enricher, caplog, mocker, mock_selenium_env):
+    mock_driver, mock_driver_class, _ = mock_selenium_env
+    mocker.patch("auto_archiver.utils.url.is_auth_wall", return_value=True)
+    screenshot_enricher.authentication = {"example.com": {"username": "user", "password": "pass"}}
+    with caplog.at_level("WARNING"):
+        screenshot_enricher.enrich(Metadata().set_url("https://example.com"))
+        assert "Screenshot enricher only supports cookie-type authentication" in caplog.text
+
+
 def test_handle_timeout_exception(screenshot_enricher, metadata_with_video, mock_selenium_env, mocker):
     mock_driver, mock_driver_class, mock_options_instance = mock_selenium_env

tests/utils/test_urls.py

+113

@@ -0,0 +1,113 @@
+import pytest
+from auto_archiver.utils.url import (
+    is_auth_wall,
+    check_url_or_raise,
+    domain_for_url,
+    is_relevant_url,
+    remove_get_parameters,
+    twitter_best_quality_url,
+)
+
+
+@pytest.mark.parametrize(
+    "url, is_auth",
+    [
+        ("https://example.com", False),
+        ("https://t.me/c/abc/123", True),
+        ("https://t.me/not-private/", False),
+        ("https://instagram.com", True),
+        ("https://www.instagram.com", True),
+        ("https://www.instagram.com/p/INVALID", True),
+        ("https://www.instagram.com/p/C4QgLbrIKXG/", True),
+    ],
+)
+def test_is_auth_wall(url, is_auth):
+    assert is_auth_wall(url) == is_auth
+
+
+@pytest.mark.parametrize(
+    "url, raises",
+    [
+        ("http://example.com", False),
+        ("https://example.com", False),
+        ("ftp://example.com", True),
+        ("http://localhost", True),
+        ("http://", True),
+    ],
+)
+def test_check_url_or_raise(url, raises):
+    if raises:
+        with pytest.raises(ValueError):
+            check_url_or_raise(url)
+    else:
+        assert check_url_or_raise(url)
+
+
+@pytest.mark.parametrize(
+    "url, domain",
+    [
+        ("https://example.com", "example.com"),
+        ("https://www.example.com", "www.example.com"),
+        ("https://www.example.com/path", "www.example.com"),
+        ("https://", ""),
+        ("http://localhost", "localhost"),
+    ],
+)
+def test_domain_for_url(url, domain):
+    assert domain_for_url(url) == domain
+
+
+@pytest.mark.parametrize(
+    "url, without_get",
+    [
+        ("https://example.com", "https://example.com"),
+        ("https://example.com?utm_source=example", "https://example.com"),
+        ("https://example.com?utm_source=example&other=1", "https://example.com"),
+        ("https://example.com/something", "https://example.com/something"),
+        ("https://example.com/something?utm_source=example", "https://example.com/something"),
+    ],
+)
+def test_remove_get_parameters(url, without_get):
+    assert remove_get_parameters(url) == without_get
+
+
+@pytest.mark.parametrize(
+    "url, relevant",
+    [
+        ("https://example.com", True),
+        ("https://example.com/favicon.ico", False),
+        ("https://twimg.com/profile_images", False),
+        ("https://twimg.com/something/default_profile_images", False),
+        ("https://scontent.cdninstagram.com/username/150x150.jpg", False),
+        ("https://static.cdninstagram.com/rsrc.php/", False),
+        ("https://telegram.org/img/emoji/", False),
+        ("https://www.youtube.com/s/gaming/emoji/", False),
+        ("https://yt3.ggpht.com/default-user=", False),
+        ("https://www.youtube.com/s/search/audio/", False),
+        ("https://ok.ru/res/i/", False),
+        ("https://vk.com/emoji/", False),
+        ("https://vk.com/images/", False),
+        ("https://vk.com/images/reaction/", False),
+        ("https://wikipedia.org/static", False),
+        ("https://example.com/file.svg", False),
+        ("https://example.com/file.ico", False),
+        ("https://example.com/file.mp4", True),
+        ("https://example.com/150x150.jpg", True),
+        ("https://example.com/rsrc.php/", True),
+        ("https://example.com/img/emoji/", True),
+    ],
+)
+def test_is_relevant_url(url, relevant):
+    assert is_relevant_url(url) == relevant
+
+
+@pytest.mark.parametrize(
+    "url, best_quality",
+    [
+        ("https://twitter.com/some_image.jpg?name=small", "https://twitter.com/some_image.jpg?name=orig"),
+        ("https://twitter.com/some_image.jpg", "https://twitter.com/some_image.jpg"),
+        ("https://twitter.com/some_image.jpg?name=orig", "https://twitter.com/some_image.jpg?name=orig"),
+    ],
+)
+def test_twitter_best_quality_url(url, best_quality):
+    assert twitter_best_quality_url(url) == best_quality
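
Assuming the package is installed and importable, the behaviours these tests lock in can also be checked interactively; a brief sketch:

from auto_archiver.utils.url import is_auth_wall, is_relevant_url, twitter_best_quality_url

assert is_auth_wall("https://instagram.com")  # no-www and plain http forms now count as auth-walled
assert not is_relevant_url("https://example.com/favicon.ico")
assert twitter_best_quality_url("https://twitter.com/some_image.jpg?name=small").endswith("name=orig")

# or run the whole module: pytest tests/utils/test_urls.py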
