3232 include /olsystem/etc/nginx/logging.conf;
3333 access_log /var/log/nginx/access.log iacombined;
3434
35+ js_shared_dict_zone zone =crawler_ips:10M type=number timeout =300s ; # njs shared dict "crawler_ips": 10M zone, numeric values, 300s entry timeout (presumably used by tagger.js - confirm)
36+ js_import /olsystem/etc/nginx/tagger.js; # njs module; its check() is bound to a variable by the js_set directive further down
37+
3538 client_max_body_size 50m ;
3639
3740 sendfile on;
@@ -52,20 +55,42 @@ http {
5255 # These rules only do anything if invoked, e.g., in web_nginx.conf.
5356 # TLDR: these rules can be disabled in `docker/web_nginx.conf`
5457 # and `docker/covers_nginx.conf`.
55- geo $should_apply_limit {
56- # No rate limit when IP obfuscation is not applied, as every IP is 255.0.0.0.
57- 255.0.0.0 0;
58- # In cluster traffic
59- 207.241.224.0 /20 0;
60- # All other traffic
61- default 1;
58+ geo $is_blessed_ip { # 1 = IP is exempt from per-IP rate limiting, 0 = not exempt
59+ 255.0.0.0 1; # Internal: when IP obfuscation is not applied, every IP is 255.0.0.0
60+ 207.241.224.0 /20 1; # In cluster traffic
61+ default 0; # All other traffic
6262 }
6363
64- map $should_apply_limit $rate_limit_key {
65- 0 '' ;
66- 1 $binary_remote_addr ;
64+ # Provides $is_blessed_ua
65+ include /olsystem/etc/nginx/is_blessed_ua.map ;
66+
67+ map "$is_blessed_ip:$is_blessed_ua" $rate_limit_key {
68+ "0:0" $binary_remote_addr ; # Neither IP nor UA is blessed: rate-limit by IP
69+ default ''; # Don't rate-limit (any other combination yields an empty key)
6770 }
6871
72+ # Check whether the User-Agent provides a means of identification (1 = yes, 0 = no).
73+ map $http_user_agent $is_identifying_ua {
74+ default 0; # No pattern below matched
75+ "~*bot" 1; # "~*" = case-insensitive regex match
76+ "~*spider" 1;
77+ "~*crawl" 1;
78+ "~*google" 1; # sometimes just GoogleOther
79+ "~*http" 1; # Includes a URL
80+ "~*@" 1; # Includes an email address
81+ }
82+
83+ js_set $has_hit_crawler_links tagger.check; # Value is produced by check() in the imported tagger njs module
84+
85+ # The only crawlers we want to limit are the ones that don't identify themselves as such.
86+ map "$is_blessed_ip:$is_identifying_ua:$has_hit_crawler_links" $global_nonidentifying_crawler_rate_limit_key {
87+ default '' ; # No shared rate limiting (empty key)
88+ "0:0:1" '1' ; # Non-blessed IP, non-identifying UA, $has_hit_crawler_links is 1: constant key, so all such requests share one limit
89+ }
90+
91+ # Apply one shared (global) limit to crawlers that scrape links but don't identify themselves
92+ limit_req_zone $global_nonidentifying_crawler_rate_limit_key zone =global_crawler_limit:5m rate=15r /s;
93+
6994 # Matches other sites
7095 limit_req_zone $rate_limit_key zone =web_limit:10m rate=1r /s; # Keyed by $rate_limit_key: the client IP, or empty when exempt
7196 # Higher rate for APIs since they are cheaper and we often hit them
0 commit comments