|
5 | 5 |
|
6 | 6 | import asyncio |
7 | 7 | import os |
| 8 | +import re |
8 | 9 | import time |
9 | 10 |
|
10 | 11 | import httpx |
@@ -94,6 +95,66 @@ async def monitor_solr(): |
94 | 95 | graphite_address='graphite.us.archive.org:2004', |
95 | 96 | ) |
96 | 97 |
|
@limit_server(["ol-www0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_partner_useragents():
    """Send per-partner request counts from recent access logs to Graphite.

    Runs the obfi shell pipeline below, which yields ``uniq -c`` style lines
    ("<count> <user agent>") for user-agent strings containing an ``@``
    (i.e. a contact email) on lines that also contain " 200 ". Each agent is
    reduced to a Graphite-safe name; names on the known-partner list get
    their own ``stats.ol.partners.<name>`` metric and everything else is
    summed into ``stats.ol.partners.other``.

    NOTE(review): presumably ``obfi_previous_minute`` limits the input to the
    last minute of logs and ``obfi_grep_bots -v`` drops known bots -- confirm
    against ../obfi.sh.
    """

    def graphite_safe(s: str) -> str:
        """Normalize a string for safe use as a Graphite metric name."""
        # Any run of characters outside [A-Za-z0-9_-] (dots and spaces
        # included) becomes an underscore. Dots matter most: the metric path
        # built below uses them as its hierarchy separator.
        s = re.sub(r'[^A-Za-z0-9_-]+', '_', s)
        # Collapse multiple underscores
        s = re.sub(r'_+', '_', s)
        return s.strip('_')

    def extract_agent_counts(ua_counts: str, allowed_names=None) -> dict:
        """Parse "<count> <user agent>" lines into ``{agent_name: count}``.

        The agent name is the first whitespace-delimited token of the user
        agent, cut at the first ``/`` and passed through ``graphite_safe``.

        Args:
            ua_counts: Newline-separated lines of "<count> <user agent>".
            allowed_names: Optional container of agent names to report
                individually. When given (and non-empty), every other agent
                is summed under the ``'other'`` key.

        Returns:
            Mapping of agent name to total request count. Blank or malformed
            lines are ignored, so empty input yields an empty dict.
        """
        agent_counts = {}
        for line in ua_counts.split("\n"):
            fields = line.split()
            # Skip blank lines (e.g. no matching traffic at all) and anything
            # that is not "<count> <agent> ..." rather than crash the job.
            if len(fields) < 2:
                continue
            try:
                count = int(fields[0])
            except ValueError:
                continue

            agent_name = graphite_safe(fields[1].split('/')[0])
            # An empty name would produce an invalid metric path, so it is
            # bucketed with the unknown agents.
            if agent_name and (not allowed_names or agent_name in allowed_names):
                key = agent_name
            else:
                key = 'other'
            # Accumulate: several distinct user-agent strings (different
            # versions or contact details) can normalize to the same name.
            agent_counts[key] = agent_counts.get(key, 0) + count
        return agent_counts

    # Sample of real partner user agents; only the names derived from it are
    # used, as the allow-list of partners that get their own metric.
    known_names = set(
        extract_agent_counts(
            """
            177 Whefi/1.0 (contact@whefi.com)
            85 Bookhives/1.0 (paulpleela@gmail.com)
            85 AliyunSecBot/Aliyun (AliyunSecBot@service.alibaba.com)
            62 BookHub/1.0 (contact@ybookshub.com)
            58 Bookscovery/1.0 (https://bookscovery.com; info@bookscovery.com)
            45 BookstoreApp/1.0 (contact@thounkai.com)
            20 Gleeph/1.0 (contact-openlibrary@gleeph.net)
            2 Tomeki/1.0 (ankit@yopmail.com , gzip)
            2 Snipd/1.0 (https://www.snipd.com) contact: company@snipd.com
            2 OnTrack/1.0 (ashkan.haghighifashi@gmail.com)
            2 Leaders.org (leaders.org) janakan@leaders.org
            2 AwarioSmartBot/1.0 (+https://awario.com/bots.html; bots@awario.com)
            1 ISBNdb (support@isbndb.com)
            """
        )
    )

    recent_uas = bash_run(
        """obfi_in_docker obfi_previous_minute | obfi_grep_bots -v | grep " 200 " | grep -Eo '[^"]+@[^"]+' | sort | uniq -c | sort -rn""",
        sources=["../obfi.sh"],
        capture_output=True,
    ).stdout

    agent_counts = extract_agent_counts(recent_uas, allowed_names=known_names)
    if not agent_counts:
        # No partner traffic was found, so there is nothing to report.
        return

    ts = int(time.time())
    events = [
        GraphiteEvent(
            path=f'stats.ol.partners.{agent}',
            value=float(count),
            timestamp=ts,
        )
        for agent, count in agent_counts.items()
    ]
    GraphiteEvent.submit_many(events, 'graphite.us.archive.org:2004')
97 | 158 |
|
98 | 159 | @limit_server(["ol-www0"], scheduler) |
99 | 160 | @scheduler.scheduled_job('interval', seconds=60) |
|
0 commit comments