Commit f4383ba

Merge pull request #413 from adobe/botsupdate
feat(bots): add more bots, based on 2024-10-31 data
2 parents: fe5c416 + 293328b

File tree

1 file changed (+176, -3 lines)
src/bots.mjs

Lines changed: 176 additions & 3 deletions
@@ -15,7 +15,7 @@
 // | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
 // | filter $d.ua2 == 'bot'
 // | groupby ua agg count() as count
-// | orderby count desc;
+// | sort by count desc
 export const bots = {
   Ads: [
     {
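
The header comment records the log query used to mine RUM data for new bot user agents; this hunk only updates its last line to the `sort by` spelling. As a rough illustration of what that query computes, here is a minimal plain-JavaScript sketch, assuming events shaped like { request: { user_agent }, rum: { user_agent } } per the field names in the comment (the sample events are invented):

// Sketch of the aggregation described in the query comment above.
// Sample events are invented, not from the actual dataset.
const events = [
  { request: { user_agent: 'GPTBot/1.0' }, rum: { user_agent: 'bot' } },
  { request: { user_agent: 'GPTBot/1.0' }, rum: { user_agent: 'bot' } },
  { request: { user_agent: 'Mozilla/5.0' }, rum: { user_agent: 'desktop' } },
];

const counts = new Map();
for (const { request, rum } of events) {
  if (rum.user_agent !== 'bot') continue; // filter $d.ua2 == 'bot'
  counts.set(request.user_agent, (counts.get(request.user_agent) ?? 0) + 1); // groupby ua agg count()
}
const ranked = [...counts.entries()].sort(([, a], [, b]) => b - a); // sort by count desc
console.log(ranked); // [ [ 'GPTBot/1.0', 2 ] ]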
@@ -39,6 +39,10 @@ export const bots = {
       user_agent: 'AmazonProductDiscovery/1.0',
       regex: 'AmazonProductDiscovery',
     },
+    {
+      user_agent: 'Storebot-Google/1.0',
+      regex: 'Storebot-Google',
+    },
   ],
   Quality: [
     {
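
Each entry pairs a human-readable user_agent label with a regex fragment matched against observed user-agent strings. The matching code is not part of this diff; the following first-match classifier is only a sketch of how the table might be consumed, with flags and anchoring assumed:

// Sketch of a first-match classifier over the table. The real consumer of
// bots.mjs is not shown in this diff; pattern flags and anchoring here are
// assumptions.
import { bots } from './bots.mjs';

export function classifyBot(userAgent) {
  for (const [category, entries] of Object.entries(bots)) {
    for (const { regex } of entries) {
      if (new RegExp(regex).test(userAgent)) return category;
    }
  }
  return null; // not a known bot
}

// classifyBot('Mozilla/5.0 (compatible; Storebot-Google/1.0)') -> 'Ads'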
@@ -57,9 +61,16 @@ export const bots = {
       user_agent: 'OSVCLinkChecker/1.0',
       regex: 'OSVCLinkChecker',
     },
+    {
+      user_agent: 'TagInspector/500.1',
+      regex: 'TagInspector',
+    },
+    {
+      user_agent: 'SiteCheck-sitecrawl by Siteimprove.com',
+      regex: 'Siteimprove',
+    },
   ],
   Monitoring: [
-
     {
       user_agent: 'DatadogSynthetics',
       regex: 'DatadogSynthetics',
@@ -100,12 +111,52 @@ export const bots = {
       user_agent: 'RuxitSynthetic various versions',
       regex: 'RuxitSynthetic',
     },
+    {
+      user_agent: 'StatusCake_Pagespeed_Indev',
+      regex: 'StatusCake_Pagespeed',
+    },
+    {
+      user_agent: 'One.Shop New Relic Synthetics',
+      regex: 'New Relic Synthetics',
+    },
+    {
+      user_agent: 'Splunk Synthetics',
+      regex: 'Splunk Synthetics',
+    },
+    {
+      user_agent: 'VisualMonitoring/0.1',
+      regex: 'VisualMonitoring',
+    },
+    {
+      user_agent: 'ERAMonitor',
+      regex: 'ERAMonitor',
+    },
+    {
+      user_agent: 'WatchMouse',
+      regex: 'watchmouse.com',
+    },
+    {
+      user_agent: 'Elastic/Synthetics',
+      regex: 'Elastic/Synthetics',
+    },
   ],
   Social: [
     {
       user_agent: 'facebookexternalhit/1.1',
       regex: 'facebookexternalhit',
     },
+    {
+      user_agent: 'Pinterestbot/1.0',
+      regex: 'Pinterestbot',
+    },
+    {
+      user_agent: 'Slackbot-LinkExpanding',
+      regex: 'Slackbot-LinkExpanding',
+    },
+    {
+      user_agent: 'Iframely/1.3.1',
+      regex: 'Iframely',
+    },
   ],
   SEO: [
     {
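
Note that the regex field is a pattern, not a literal substring: WatchMouse is matched via 'watchmouse.com', where the unescaped dot matches any character, and the '/' in 'Elastic/Synthetics' needs no escaping when the pattern is compiled from a string. A quick check (the WatchMouse UA string below is illustrative, not quoted from the dataset):

// The regex field is compiled from a string, so '/' needs no escaping and
// the unescaped '.' in 'watchmouse.com' matches any character.
const checks = [
  ['Mozilla/4.0 (compatible; WatchMouse; https://watchmouse.com)', 'watchmouse.com'],
  ['Elastic/Synthetics', 'Elastic/Synthetics'],
];
for (const [ua, pattern] of checks) {
  console.log(`${pattern} -> ${new RegExp(pattern).test(ua)}`); // true, true
}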
@@ -133,6 +184,54 @@ export const bots = {
       regex: 'https://deepcrawl.com/bot',
     },
   ],
+  // There is some overlap between AI and Search, and some companies like Apple,
+  // Google, and Meta have many bots that are used for different purposes.
+  // the repo https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.json
+  // has a longer list of AI bots, but many of them don't have enough traffic
+  // to be worth adding here. In addition, the repo seems to err on the side of
+  // classifying crawlers as AI bots.
+  AI: [
+    {
+      user_agent: 'Ai2Bot-Dolma',
+      regex: 'Ai2Bot-Dolma',
+    },
+    {
+      user_agent: 'GPTBot',
+      regex: 'GPTBot',
+    },
+    {
+      user_agent: 'Claude-Web',
+      regex: 'Claude-Web',
+    },
+    {
+      user_agent: 'anthropic-ai',
+      regex: 'anthropic-ai',
+    },
+    {
+      user_agent: 'Google-Extended',
+      regex: 'Google-Extended',
+    },
+    {
+      user_agent: 'FacebookBot',
+      regex: 'FacebookBot',
+    },
+    {
+      user_agent: 'Applebot-Extended',
+      regex: 'Applebot.*Extended',
+    },
+    {
+      user_agent: 'Meta-ExternalAgent',
+      regex: 'Meta-ExternalAgent',
+    },
+    {
+      user_agent: 'PerplexityBot',
+      regex: 'PerplexityBot',
+    },
+    {
+      user_agent: 'YouBot',
+      regex: 'YouBot',
+    },
+  ],
   Search: [
     {
       user_agent: 'CoveoBot/2.0',
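
The overlap the new comment warns about is concrete in this very commit: 'Applebot.*Extended' lands in AI, while a plain 'Applebot' pattern is added to Search further down, and Apple's opt-out agent matches both. Because AI is declared before Search, a first-match scan in key order (as in the classifier sketch above) resolves it as AI. A sketch of the collision (the UA string is illustrative):

// Apple's AI opt-out agent matches both the AI and the Search patterns,
// so classification depends on which category is checked first.
const ua = 'Mozilla/5.0 (compatible; Applebot-Extended/0.1)';
console.log(new RegExp('Applebot.*Extended').test(ua)); // true -> AI
console.log(new RegExp('Applebot').test(ua)); // also true -> Search, if checked first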
@@ -144,7 +243,7 @@ export const bots = {
     },
     {
       user_agent: 'YandexRenderResourcesBot/1.0',
-      regex: 'YandexRenderResourcesBot',
+      regex: 'Yandex',
     },
     {
       user_agent: 'MS-Search Crawler',
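
Broadening 'YandexRenderResourcesBot' to 'Yandex' makes this single entry cover the whole Yandex bot family rather than one crawler. A quick check against Yandex's publicly documented UA strings (quoted from memory of their bot docs, so treat as illustrative):

// One 'Yandex' pattern now covers the whole family of Yandex crawlers.
const yandex = new RegExp('Yandex');
[
  'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
  'Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0; +http://yandex.com/bots)',
  'Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)',
].forEach((ua) => console.log(yandex.test(ua))); // true, true, true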
@@ -158,11 +257,85 @@ export const bots = {
       user_agent: 'Googlebot/2.1',
       regex: 'Googlebot',
     },
+    {
+      user_agent: 'Applebot/0.1',
+      regex: 'Applebot',
+    },
+    {
+      user_agent: 'AddSearchBot/1.0',
+      regex: 'AddSearchBot',
+    },
+    {
+      user_agent: 'ClarityBot/9.0',
+      regex: 'ClarityBot',
+    },
+    {
+      user_agent: 'PetalBot',
+      regex: 'PetalBot',
+    },
   ],
   Security: [
     {
       user_agent: 'Popetech-Scanbot/1.0',
       regex: 'Popetech-Scanbot',
     },
+    {
+      user_agent: 'Detectify',
+      regex: 'Detectify',
+    },
+    {
+      user_agent: 'Probely',
+      regex: 'Probely',
+    },
+    {
+      user_agent: 'VirusTotalBot',
+      regex: 'VirusTotalBot',
+    },
+
+  ],
+  Compliance: [
+    {
+      user_agent: 'Cookiebot/1.0',
+      regex: 'Cookiebot',
+    },
+    {
+      user_agent: 'OneTrustBot',
+      regex: 'OneTrustBot',
+    },
+    {
+      user_agent: 'CookieReports.com',
+      regex: 'CookieReports.com',
+    },
+    {
+      user_agent: 'ActiveComply',
+      regex: 'ActiveComply',
+    },
+  ],
+  Archive: [
+    {
+      user_agent: 'PageFreezer',
+      regex: 'PageFreezer',
+    },
+    {
+      user_agent: 'mirrorweb.com',
+      regex: 'mirrorweb.com',
+    },
+    {
+      // CCBot is a bot that crawls the web to find content for the Common Crawl project
+      // common crawl is used to train many AI datasets, but the bot is not an AI crawler
+      // per se, so it's not included in the AI category
+      user_agent: 'CCBot',
+      regex: 'CCBot',
+    },
+  ],
+  Translation: [
+    {
+      user_agent: 'TransSync',
+      regex: 'TransSync.*motionpoint',
+    },
+    {
+      user_agent: 'WovnCrawler',
+      regex: 'WovnCrawler',
+    },
   ],
 };
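
Since every regex field is compiled from a string, a cheap sanity check after a bulk update like this one is to confirm each pattern still compiles. This is not part of the commit, just a sketch:

// Not part of the commit: verify every pattern in the table compiles.
import { bots } from './bots.mjs';

for (const [category, entries] of Object.entries(bots)) {
  for (const { user_agent: ua, regex } of entries) {
    try {
      new RegExp(regex);
    } catch {
      console.error(`invalid pattern in ${category} (${ua}): ${regex}`);
    }
  }
}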
