Skip to content

Commit 530bc7a

Browse files
authored
Merge branch 'master' into refactor/remove-mixins-less
2 parents a2bdb26 + 5876ccb commit 530bc7a

File tree

36 files changed

+1017
-234
lines changed

36 files changed

+1017
-234
lines changed

.github/ISSUE_TEMPLATE/bug_report.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ body:
2020
label: Reproducing the bug
2121
description: Being as specific as possible, what steps result in triggering this bug?
2222
value: |
23+
2324
1. Go to ...
2425
2. Do ...
2526
@@ -61,4 +62,4 @@ body:
6162
- type: markdown
6263
attributes:
6364
value: |
64-
Thanks for taking the time to fill out this bug report! 👍
65+
Thanks for taking the time to fill out this bug report! 👍

compose.production.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ services:
2727
restart: unless-stopped
2828
hostname: "$HOSTNAME"
2929
environment:
30-
- GUNICORN_OPTS= --workers 1 --timeout 300 --max-requests 500
30+
- GUNICORN_OPTS= --workers 2 --timeout 300 --max-requests 500
3131
- OL_CONFIG=/olsystem/etc/openlibrary.yml
3232
volumes:
3333
- ../booklending_utils:/booklending_utils

compose.staging.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ services:
3838
restart: unless-stopped
3939
hostname: "$HOSTNAME"
4040
environment:
41-
- GUNICORN_OPTS= --workers 1 --timeout 180 --max-requests 500
41+
- GUNICORN_OPTS= --workers 2 --timeout 180 --max-requests 500
4242
- OL_CONFIG=/olsystem/etc/openlibrary.yml
4343
volumes:
4444
- ../olsystem:/olsystem

compose.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ services:
2020
image: "${OLIMAGE:-oldev:latest}"
2121
environment:
2222
- OL_CONFIG=${OL_CONFIG:-/openlibrary/conf/openlibrary.yml}
23-
- GUNICORN_OPTS=${GUNICORN_OPTS:- --reload --workers 1 --timeout 180 --max-requests 500}
23+
- GUNICORN_OPTS=${GUNICORN_OPTS:- --reload --workers 2 --timeout 180 --max-requests 500}
2424
command: docker/ol-web-fastapi-start.sh
2525
ports:
2626
- ${FAST_WEB_PORT:-18080}:8080

docker/nginx.conf

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ http {
3232
include /olsystem/etc/nginx/logging.conf;
3333
access_log /var/log/nginx/access.log iacombined;
3434

35+
js_shared_dict_zone zone=crawler_ips:10M type=number timeout=300s;
36+
js_import /olsystem/etc/nginx/tagger.js;
37+
3538
client_max_body_size 50m;
3639

3740
sendfile on;
@@ -52,20 +55,42 @@ http {
5255
# These rules only do anything if invoked, e.g., in web_nginx.conf.
5356
# TLDR: these rules can be disabled in `docker/web_nginx.conf`
5457
# and `docker/covers_nginx.conf`.
55-
geo $should_apply_limit {
56-
# No rate limit when IP obfuscation is not applied, as every IP is 255.0.0.0.
57-
255.0.0.0 0;
58-
# In cluster traffic
59-
207.241.224.0/20 0;
60-
# All other traffic
61-
default 1;
58+
geo $is_blessed_ip {
59+
255.0.0.0 1; # Internal
60+
207.241.224.0/20 1; # In cluster traffic
61+
default 0; # All other traffic
6262
}
6363

64-
map $should_apply_limit $rate_limit_key {
65-
0 '';
66-
1 $binary_remote_addr;
64+
# Provides $is_blessed_ua
65+
include /olsystem/etc/nginx/is_blessed_ua.map;
66+
67+
map "$is_blessed_ip:$is_blessed_ua" $rate_limit_key {
68+
"0:0" $binary_remote_addr; # Rate-limit by IP
69+
default ''; # Don't rate-limit
6770
}
6871

72+
# check if user-agent provides a means of identification
73+
map $http_user_agent $is_identifying_ua {
74+
default 0;
75+
"~*bot" 1;
76+
"~*spider" 1;
77+
"~*crawl" 1;
78+
"~*google" 1; # sometimes just GoogleOther
79+
"~*http" 1; # Includes url
80+
"~*@" 1; # Includes email
81+
}
82+
83+
js_set $has_hit_crawler_links tagger.check;
84+
85+
# The only crawlers we want to limit are the ones that don't identify themselves as such
86+
map "$is_blessed_ip:$is_identifying_ua:$has_hit_crawler_links" $global_nonidentifying_crawler_rate_limit_key {
87+
default ''; # No shared rate limiting
88+
"0:0:1" '1'; # Shared rate limit
89+
}
90+
91+
# Apply one shared (global) rate limit to crawlers that scrape links but don't identify themselves
92+
limit_req_zone $global_nonidentifying_crawler_rate_limit_key zone=global_crawler_limit:5m rate=15r/s;
93+
6994
# Matches other sites
7095
limit_req_zone $rate_limit_key zone=web_limit:10m rate=1r/s;
7196
# Higher rate for APIs since they are cheaper and we often hit them

docker/web_nginx.conf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,12 @@ server {
105105

106106
location / {
107107
limit_req zone=web_limit burst=100 delay=10;
108+
limit_req zone=global_crawler_limit nodelay;
108109
limit_req_status 429;
109110

111+
js_set $is_crawler_link tagger.tag_crawler;
112+
add_header X-SPS $is_crawler_link; # Need to reference the variable for the js method to be executed
113+
110114
# For returning 200 when someone tries to randomly sort author results.
111115
if ($is_sus_random_sort) {
112116
return 200;
@@ -117,7 +121,7 @@ server {
117121
}
118122

119123
if ($is_sus_referer) {
120-
return 444;
124+
return 403;
121125
}
122126

123127
# Haproxy to better handle load/traffic
@@ -138,7 +142,7 @@ server {
138142
limit_req_status 429;
139143

140144
if ($http_user_agent ~* (bytespider|meta-externalagent) ) {
141-
return 444;
145+
return 403;
142146
}
143147

144148
# Haproxy to better handle load/traffic

openlibrary/asgi_app.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import re
6+
import sys
67
from pathlib import Path
78

89
import yaml
@@ -114,28 +115,29 @@ async def i18n_middleware(request: Request, call_next):
114115

115116

116117
def create_app() -> FastAPI:
117-
_setup_env()
118+
if "pytest" not in sys.modules:
119+
_setup_env()
118120

119-
if os.environ.get("CI"):
120-
import pytest
121+
if os.environ.get("CI"):
122+
import pytest
121123

122-
pytest.skip("Skipping in CI", allow_module_level=True)
124+
pytest.skip("Skipping in CI", allow_module_level=True)
123125

124-
ol_config_path = Path(__file__).parent / "conf" / "openlibrary.yml"
125-
ol_config = os.environ.get("OL_CONFIG", str(ol_config_path))
126-
try:
127-
# We still call this even though we don't use it because of the side effects
128-
legacy_wsgi = _load_legacy_wsgi(ol_config) # noqa: F841
126+
ol_config_path = Path(__file__).parent / "conf" / "openlibrary.yml"
127+
ol_config = os.environ.get("OL_CONFIG", str(ol_config_path))
128+
try:
129+
# We still call this even though we don't use it because of the side effects
130+
legacy_wsgi = _load_legacy_wsgi(ol_config) # noqa: F841
129131

130-
global sentry
131-
if sentry is not None:
132-
return
133-
sentry = init_sentry(getattr(infogami.config, 'sentry', {}))
134-
set_tag("fastapi", True)
132+
global sentry
133+
if sentry is not None:
134+
return
135+
sentry = init_sentry(getattr(infogami.config, 'sentry', {}))
136+
set_tag("fastapi", True)
135137

136-
except Exception:
137-
logger.exception("Failed to initialize legacy WSGI app")
138-
raise
138+
except Exception:
139+
logger.exception("Failed to initialize legacy WSGI app")
140+
raise
139141

140142
app = FastAPI(title="OpenLibrary ASGI", version="0.0.1")
141143

@@ -154,6 +156,13 @@ def create_app() -> FastAPI:
154156

155157
setup_i18n(app)
156158

159+
@app.middleware("http")
160+
async def add_fastapi_header(request: Request, call_next):
161+
"""Middleware to add a header indicating the response came from FastAPI."""
162+
response = await call_next(request)
163+
response.headers["X-Served-By"] = "FastAPI"
164+
return response
165+
157166
# --- Fast routes (mounted within this app) ---
158167
@app.get("/health")
159168
def health() -> dict[str, str]:

openlibrary/book_providers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ def get_acquisitions(
381381
access=access,
382382
format='web',
383383
price=None,
384-
url=f'https://archive.org/details/{self.get_best_identifier(db_edition or ed_or_solr)}',
384+
url=f'https://archive.org/details/{self.get_best_identifier(db_edition or ed_or_solr)}?view=theater&wrapper=false',
385385
provider_name=self.short_name,
386386
)
387387
]

openlibrary/core/cache.py

Lines changed: 1 addition & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
"MemcacheCache",
2626
"MemoryCache",
2727
"RequestCache",
28-
"cached_property",
2928
"get_memcache",
3029
"memcache_memoize",
3130
"memoize",
@@ -241,28 +240,6 @@ def memcache_get(self, args: tuple, kw: dict) -> tuple[T, float] | None:
241240
####
242241

243242

244-
def cached_property(getter):
245-
"""Decorator like `property`, but the value is computed on first call and cached.
246-
247-
class Foo:
248-
249-
@cached_property
250-
def memcache_client(self):
251-
...
252-
"""
253-
name = getter.__name__
254-
255-
def g(self):
256-
if name in self.__dict__:
257-
return self.__dict__[name]
258-
259-
value = getter(self)
260-
self.__dict__[name] = value
261-
return value
262-
263-
return property(g)
264-
265-
266243
class Cache:
267244
"""Cache interface."""
268245

@@ -322,7 +299,7 @@ class MemcacheCache(Cache):
322299
Expects that the memcache servers are specified in web.config.memcache_servers.
323300
"""
324301

325-
@cached_property
302+
@functools.cached_property
326303
def memcache(self):
327304
if servers := config.get("memcache_servers", None):
328305
return olmemcache.Client(servers)

openlibrary/core/ia.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,48 @@ def get_api_response(url: str, params: dict | None = None) -> dict:
4242
return api_response
4343

4444

45+
def save_page_now(
    url: str, access_key: str | None = None, secret_key: str | None = None
) -> str:
    """Archive a URL using the Internet Archive Save Page Now API.

    When either credential is missing, both are looked up via
    ``get_ia_s3_keys()``.

    Returns job_id on success, or an error string on failure:
    "NO_CREDENTIALS", "ERROR_HTTP_<code>", "ERROR_NO_JOB_ID_<code>" (the
    response body was not JSON, or carried no job_id), or
    "ERROR_EXCEPTION_<msg>" (message truncated to 50 characters).
    """
    if not access_key or not secret_key:
        access_key, secret_key = get_ia_s3_keys()

    if not access_key or not secret_key:
        return "NO_CREDENTIALS"

    headers = {
        "Authorization": f"LOW {access_key}:{secret_key}",
        "Accept": "application/json",
    }
    data = {"url": url}

    try:
        r = session.post(
            "https://web.archive.org/save", headers=headers, data=data, timeout=30
        )
    # httpx has no `RequestException` attribute; naming it here would raise
    # AttributeError while handling the real error. `HTTPError` is httpx's
    # base class for request/transport failures.
    # NOTE(review): assumes `session` is an httpx client -- confirm.
    except (httpx.HTTPError, ValueError) as e:
        return f"ERROR_EXCEPTION_{str(e)[:50]}"

    if r.status_code != 200:
        return f"ERROR_HTTP_{r.status_code}"

    no_job_id = f"ERROR_NO_JOB_ID_{r.status_code}"
    try:
        result = r.json()
    except ValueError:
        return no_job_id

    # A JSON body that is not an object (e.g. a list) has no `.get`; report it
    # as a missing job id instead of letting AttributeError escape.
    if not isinstance(result, dict):
        return no_job_id
    return result.get('job_id', no_job_id)
79+
80+
81+
def get_ia_s3_keys() -> tuple[str | None, str | None]:
    """Look up the Save Page Now S3 key pair in the infogami config.

    Reads the "ol_spn_api_s3" config section and returns
    ``(s3_key, s3_secret)``; either element is None when it is not set.
    """
    credentials = config.get("ol_spn_api_s3", {})
    s3_key = credentials.get("s3_key")
    s3_secret = credentials.get("s3_secret")
    return s3_key, s3_secret
85+
86+
4587
def get_metadata_direct(
4688
itemid: str, only_metadata: bool = True, cache: bool = True
4789
) -> dict:

0 commit comments

Comments
 (0)