Skip to content

Commit 48828d3

Browse files
authored
Merge pull request #11420 from cdrini/experiment/counter-link
Experiment with counter link
2 parents ebfc080 + ddd845b commit 48828d3

File tree

3 files changed

+43
-12
lines changed

3 files changed

+43
-12
lines changed

docker/nginx.conf

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ http {
3232
include /olsystem/etc/nginx/logging.conf;
3333
access_log /var/log/nginx/access.log iacombined;
3434

35+
js_shared_dict_zone zone=crawler_ips:10M type=number timeout=300s;
36+
js_import /olsystem/etc/nginx/tagger.js;
37+
3538
client_max_body_size 50m;
3639

3740
sendfile on;
@@ -52,20 +55,42 @@ http {
5255
# These rules only do anything if invoked, e.g., in web_nginx.conf.
5356
# TLDR: these rules can be disabled in `docker/web_nginx.conf`
5457
# and `docker/covers_nginx.conf`.
55-
geo $should_apply_limit {
56-
# No rate limit when IP obfuscation is not applied, as every IP is 255.0.0.0.
57-
255.0.0.0 0;
58-
# In cluster traffic
59-
207.241.224.0/20 0;
60-
# All other traffic
61-
default 1;
58+
geo $is_blessed_ip {
59+
255.0.0.0 1; # Internal
60+
207.241.224.0/20 1; # In cluster traffic
61+
default 0; # All other traffic
6262
}
6363

64-
map $should_apply_limit $rate_limit_key {
65-
0 '';
66-
1 $binary_remote_addr;
64+
# Provides $is_blessed_ua
65+
include /olsystem/etc/nginx/is_blessed_ua.map;
66+
67+
map "$is_blessed_ip:$is_blessed_ua" $rate_limit_key {
68+
"0:0" $binary_remote_addr; # Rate-limit by IP
69+
default ''; # Don't rate-limit
6770
}
6871

72+
# Check whether the User-Agent provides a means of identification
73+
map $http_user_agent $is_identifying_ua {
74+
default 0;
75+
"~*bot" 1;
76+
"~*spider" 1;
77+
"~*crawl" 1;
78+
"~*google" 1; # sometimes just GoogleOther
79+
"~*http" 1; # Includes url
80+
"~*@" 1; # Includes email
81+
}
82+
83+
js_set $has_hit_crawler_links tagger.check;
84+
85+
# The only crawlers we want to limit are the ones that don't identify themselves as such
86+
map "$is_blessed_ip:$is_identifying_ua:$has_hit_crawler_links" $global_nonidentifying_crawler_rate_limit_key {
87+
default ''; # No shared rate limiting
88+
"0:0:1" '1'; # Shared rate limit
89+
}
90+
91+
# Apply a single global (shared) limit to the crawlers that scrape links but don't ID themselves
92+
limit_req_zone $global_nonidentifying_crawler_rate_limit_key zone=global_crawler_limit:5m rate=15r/s;
93+
6994
# Matches other sites
7095
limit_req_zone $rate_limit_key zone=web_limit:10m rate=1r/s;
7196
# Higher rate for APIs since they are cheaper and we often hit them

docker/web_nginx.conf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,12 @@ server {
105105

106106
location / {
107107
limit_req zone=web_limit burst=100 delay=10;
108+
limit_req zone=global_crawler_limit nodelay;
108109
limit_req_status 429;
109110

111+
js_set $is_crawler_link tagger.tag_crawler;
112+
add_header X-SPS $is_crawler_link; # Need to reference the variable for the js method to be executed
113+
110114
# For returning 200 when someone tries to randomly sort author results.
111115
if ($is_sus_random_sort) {
112116
return 200;
@@ -117,7 +121,7 @@ server {
117121
}
118122

119123
if ($is_sus_referer) {
120-
return 444;
124+
return 403;
121125
}
122126

123127
# Haproxy to better handle load/traffic
@@ -138,7 +142,7 @@ server {
138142
limit_req_status 429;
139143

140144
if ($http_user_agent ~* (bytespider|meta-externalagent) ) {
141-
return 444;
145+
return 403;
142146
}
143147

144148
# Haproxy to better handle load/traffic

openlibrary/templates/lib/nav_foot.html

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ <h2>$_("Help")</h2>
4444
<li><a href="/help/faq/editing" title="$_('Suggest Edits')">$_("Suggesting Edits")</a></li>
4545
<li><a href="/books/add" title="$_('Add a new book to Open Library')">$_("Add a Book")</a></li>
4646
<li><a href="https://github.com/internetarchive/openlibrary/releases" title="$_('Release Notes')">$_("Release Notes")</a></li>
47+
$# detect-missing-i18n-skip-line
48+
<li><a href="$changequery(dict(show_page_status=1))" style="color:transparent;position:absolute;pointer-events:none;" tabindex="-1">Page Status</a></li>
4749
</ul>
4850
<aside>
4951
<a class="footer-icon" title="$_('Twitter')" href="https://twitter.com/OpenLibrary"><img src="/static/images/tweet.svg" alt="" loading="lazy"></a>

0 commit comments

Comments
 (0)