Skip to content

Commit 09f47d8

Browse files
authored
Implement user agent monitoring for Graphite
Added user agent monitoring functionality to track and normalize user agent counts for Graphite metrics.
1 parent 8353b73 commit 09f47d8

File tree

1 file changed

+61
-0
lines changed

1 file changed

+61
-0
lines changed

scripts/monitoring/monitor.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import asyncio
77
import os
8+
import re
89
import time
910

1011
import httpx
@@ -94,6 +95,66 @@ async def monitor_solr():
9495
graphite_address='graphite.us.archive.org:2004',
9596
)
9697

98+
@limit_server(["ol-www0"], scheduler)
@scheduler.scheduled_job('interval', seconds=60)
async def monitor_partner_useragents():
    """Send per-partner request counts for the previous minute to Graphite.

    Counts successful (HTTP 200) non-bot requests whose logged user agent
    contains an ``@`` (i.e. a contact address), groups them by normalized
    agent name, and submits one ``stats.ol.partners.<name>`` event per
    group. Agents that are not in the hard-coded list of known partners
    are summed into a single ``other`` bucket.

    NOTE(review): the ``obfi_*`` shell helpers come from ``../obfi.sh`` and
    are not visible here; they are presumed to read the previous minute of
    access logs -- confirm against that script.
    """

    def graphite_safe(s: str) -> str:
        """Normalize a string for safe use as a Graphite metric name."""
        # The name is embedded as one component of a dotted metric path
        # below, so dots (and spaces) must not survive.
        s = s.replace('.', '_').replace(' ', '_')
        # Anything outside [A-Za-z0-9_-] becomes an underscore.
        s = re.sub(r'[^A-Za-z0-9_-]+', '_', s)
        # Collapse runs of underscores introduced by the replacements above.
        s = re.sub(r'_+', '_', s)
        # Strip leading/trailing underscores or dots.
        return s.strip('._')

    def extract_agent_counts(ua_counts: str, allowed_names=None) -> dict:
        """Parse ``uniq -c`` style output into ``{agent_name: count}``.

        Each useful line looks like ``<count> <agent>[/<version>] ...``; the
        agent name is the second whitespace-separated token up to the first
        ``/``, passed through ``graphite_safe``.

        Args:
            ua_counts: Multi-line text, one ``<count> <user agent>`` per line.
                Blank or malformed lines are skipped.
            allowed_names: Optional container of normalized names to keep as
                their own keys. When given (and non-empty), every other agent
                is summed under ``'other'``. When omitted or empty, every
                agent with a non-empty normalized name keeps its own key.

        Returns:
            Mapping of normalized agent name to total count. Counts of
            distinct raw agents that normalize to the same name are summed.
        """
        agent_counts: dict = {}
        for line in ua_counts.splitlines():
            parts = line.split()
            # Need at least "<count> <agent>"; this also skips blank lines,
            # which is what empty pipeline output turns into.
            if len(parts) < 2:
                continue
            try:
                count = int(parts[0])
            except ValueError:
                # Not a count line; one bad line must not drop the
                # whole minute's metrics.
                continue
            agent_name = graphite_safe(parts[1].split('/')[0])
            # An empty name would yield a metric path ending in a dot, so it
            # is never reported under its own key.
            keep_name = bool(agent_name) and (
                not allowed_names or agent_name in allowed_names
            )
            key = agent_name if keep_name else 'other'
            # Accumulate: e.g. "Foo/1.0" and "Foo/2.0" share the key "Foo".
            agent_counts[key] = agent_counts.get(key, 0) + count
        return agent_counts

    # Sample of partner user agents; only the normalized names are used
    # (as the allow-list below), the counts here are ignored.
    known_names = frozenset(
        extract_agent_counts(
            """
    177 Whefi/1.0 (contact@whefi.com)
    85 Bookhives/1.0 (paulpleela@gmail.com)
    85 AliyunSecBot/Aliyun (AliyunSecBot@service.alibaba.com)
    62 BookHub/1.0 (contact@ybookshub.com)
    58 Bookscovery/1.0 (https://bookscovery.com; info@bookscovery.com)
    45 BookstoreApp/1.0 (contact@thounkai.com)
    20 Gleeph/1.0 (contact-openlibrary@gleeph.net)
    2 Tomeki/1.0 (ankit@yopmail.com , gzip)
    2 Snipd/1.0 (https://www.snipd.com) contact: company@snipd.com
    2 OnTrack/1.0 (ashkan.haghighifashi@gmail.com)
    2 Leaders.org (leaders.org) janakan@leaders.org
    2 AwarioSmartBot/1.0 (+https://awario.com/bots.html; bots@awario.com)
    1 ISBNdb (support@isbndb.com)
    """
        )
    )

    # Pipeline: keep lines containing " 200 ", pull out the quote-free span
    # around an "@", then count identical spans (most frequent first).
    recent_uas = bash_run(
        """obfi_in_docker obfi_previous_minute | obfi_grep_bots -v | grep " 200 " | grep -Eo '[^"]+@[^"]+' | sort | uniq -c | sort -rn""",
        sources=["../obfi.sh"],
        capture_output=True,
    ).stdout

    agent_counts = extract_agent_counts(recent_uas, allowed_names=known_names)
    if not agent_counts:
        # No matching requests in the previous minute: nothing to submit.
        return

    # Use one timestamp for the whole batch so the points line up.
    ts = int(time.time())
    events = [
        GraphiteEvent(
            path=f'stats.ol.partners.{agent}',
            value=float(count),
            timestamp=ts,
        )
        for agent, count in agent_counts.items()
    ]
    GraphiteEvent.submit_many(events, 'graphite.us.archive.org:2004')
97158

98159
@limit_server(["ol-www0"], scheduler)
99160
@scheduler.scheduled_job('interval', seconds=60)

0 commit comments

Comments
 (0)