15
15
// | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
16
16
// | filter $d.ua2 == 'bot'
17
17
// | groupby ua agg count() as count
18
- // | orderby count desc;
18
+ // | sort by count desc
19
19
export const bots = {
20
20
Ads : [
21
21
{
@@ -39,6 +39,10 @@ export const bots = {
39
39
user_agent : 'AmazonProductDiscovery/1.0' ,
40
40
regex : 'AmazonProductDiscovery' ,
41
41
} ,
42
+ {
43
+ user_agent : 'Storebot-Google/1.0' ,
44
+ regex : 'Storebot-Google' ,
45
+ } ,
42
46
] ,
43
47
Quality : [
44
48
{
@@ -57,9 +61,16 @@ export const bots = {
57
61
user_agent : 'OSVCLinkChecker/1.0' ,
58
62
regex : 'OSVCLinkChecker' ,
59
63
} ,
64
+ {
65
+ user_agent : 'TagInspector/500.1' ,
66
+ regex : 'TagInspector' ,
67
+ } ,
68
+ {
69
+ user_agent : 'SiteCheck-sitecrawl by Siteimprove.com' ,
70
+ regex : 'Siteimprove' ,
71
+ } ,
60
72
] ,
61
73
Monitoring : [
62
-
63
74
{
64
75
user_agent : 'DatadogSynthetics' ,
65
76
regex : 'DatadogSynthetics' ,
@@ -100,12 +111,52 @@ export const bots = {
100
111
user_agent : 'RuxitSynthetic various versions' ,
101
112
regex : 'RuxitSynthetic' ,
102
113
} ,
114
+ {
115
+ user_agent : 'StatusCake_Pagespeed_Indev' ,
116
+ regex : 'StatusCake_Pagespeed' ,
117
+ } ,
118
+ {
119
+ user_agent : 'One.Shop New Relic Synthetics' ,
120
+ regex : 'New Relic Synthetics' ,
121
+ } ,
122
+ {
123
+ user_agent : 'Splunk Synthetics' ,
124
+ regex : 'Splunk Synthetics' ,
125
+ } ,
126
+ {
127
+ user_agent : 'VisualMonitoring/0.1' ,
128
+ regex : 'VisualMonitoring' ,
129
+ } ,
130
+ {
131
+ user_agent : 'ERAMonitor' ,
132
+ regex : 'ERAMonitor' ,
133
+ } ,
134
+ {
135
+ user_agent : 'WatchMouse' ,
136
+ regex : 'watchmouse.com' ,
137
+ } ,
138
+ {
139
+ user_agent : 'Elastic/Synthetics' ,
140
+ regex : 'Elastic/Synthetics' ,
141
+ } ,
103
142
] ,
104
143
Social : [
105
144
{
106
145
user_agent : 'facebookexternalhit/1.1' ,
107
146
regex : 'facebookexternalhit' ,
108
147
} ,
148
+ {
149
+ user_agent : 'Pinterestbot/1.0' ,
150
+ regex : 'Pinterestbot' ,
151
+ } ,
152
+ {
153
+ user_agent : 'Slackbot-LinkExpanding' ,
154
+ regex : 'Slackbot-LinkExpanding' ,
155
+ } ,
156
+ {
157
+ user_agent : 'Iframely/1.3.1' ,
158
+ regex : 'Iframely' ,
159
+ } ,
109
160
] ,
110
161
SEO : [
111
162
{
@@ -133,6 +184,54 @@ export const bots = {
133
184
regex : 'https://deepcrawl.com/bot' ,
134
185
} ,
135
186
] ,
187
+ // There is some overlap between AI and Search, and some companies like Apple,
188
+ // Google, and Meta have many bots that are used for different purposes.
189
+ // The repo https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.json
190
+ // has a longer list of AI bots, but many of them don't have enough traffic
191
+ // to be worth adding here. In addition, the repo seems to err on the side of
192
+ // classifying crawlers as AI bots.
193
+ AI : [
194
+ {
195
+ user_agent : 'Ai2Bot-Dolma' ,
196
+ regex : 'Ai2Bot-Dolma' ,
197
+ } ,
198
+ {
199
+ user_agent : 'GPTBot' ,
200
+ regex : 'GPTBot' ,
201
+ } ,
202
+ {
203
+ user_agent : 'Claude-Web' ,
204
+ regex : 'Claude-Web' ,
205
+ } ,
206
+ {
207
+ user_agent : 'anthropic-ai' ,
208
+ regex : 'anthropic-ai' ,
209
+ } ,
210
+ {
211
+ user_agent : 'Google-Extended' ,
212
+ regex : 'Google-Extended' ,
213
+ } ,
214
+ {
215
+ user_agent : 'FacebookBot' ,
216
+ regex : 'FacebookBot' ,
217
+ } ,
218
+ {
219
+ user_agent : 'Applebot-Extended' ,
220
+ regex : 'Applebot.*Extended' ,
221
+ } ,
222
+ {
223
+ user_agent : 'Meta-ExternalAgent' ,
224
+ regex : 'Meta-ExternalAgent' ,
225
+ } ,
226
+ {
227
+ user_agent : 'PerplexityBot' ,
228
+ regex : 'PerplexityBot' ,
229
+ } ,
230
+ {
231
+ user_agent : 'YouBot' ,
232
+ regex : 'YouBot' ,
233
+ } ,
234
+ ] ,
136
235
Search : [
137
236
{
138
237
user_agent : 'CoveoBot/2.0' ,
@@ -144,7 +243,7 @@ export const bots = {
144
243
} ,
145
244
{
146
245
user_agent : 'YandexRenderResourcesBot/1.0' ,
147
- regex : 'YandexRenderResourcesBot ' ,
246
+ regex : 'Yandex ' ,
148
247
} ,
149
248
{
150
249
user_agent : 'MS-Search Crawler' ,
@@ -158,11 +257,85 @@ export const bots = {
158
257
user_agent : 'Googlebot/2.1' ,
159
258
regex : 'Googlebot' ,
160
259
} ,
260
+ {
261
+ user_agent : 'Applebot/0.1' ,
262
+ regex : 'Applebot' ,
263
+ } ,
264
+ {
265
+ user_agent : 'AddSearchBot/1.0' ,
266
+ regex : 'AddSearchBot' ,
267
+ } ,
268
+ {
269
+ user_agent : 'ClarityBot/9.0' ,
270
+ regex : 'ClarityBot' ,
271
+ } ,
272
+ {
273
+ user_agent : 'PetalBot' ,
274
+ regex : 'PetalBot' ,
275
+ } ,
161
276
] ,
162
277
Security : [
163
278
{
164
279
user_agent : 'Popetech-Scanbot/1.0' ,
165
280
regex : 'Popetech-Scanbot' ,
166
281
} ,
282
+ {
283
+ user_agent : 'Detectify' ,
284
+ regex : 'Detectify' ,
285
+ } ,
286
+ {
287
+ user_agent : 'Probely' ,
288
+ regex : 'Probely' ,
289
+ } ,
290
+ {
291
+ user_agent : 'VirusTotalBot' ,
292
+ regex : 'VirusTotalBot' ,
293
+ } ,
294
+
295
+ ] ,
296
+ Compliance : [
297
+ {
298
+ user_agent : 'Cookiebot/1.0' ,
299
+ regex : 'Cookiebot' ,
300
+ } ,
301
+ {
302
+ user_agent : 'OneTrustBot' ,
303
+ regex : 'OneTrustBot' ,
304
+ } ,
305
+ {
306
+ user_agent : 'CookieReports.com' ,
307
+ regex : 'CookieReports.com' ,
308
+ } ,
309
+ {
310
+ user_agent : 'ActiveComply' ,
311
+ regex : 'ActiveComply' ,
312
+ } ,
313
+ ] ,
314
+ Archive : [
315
+ {
316
+ user_agent : 'PageFreezer' ,
317
+ regex : 'PageFreezer' ,
318
+ } ,
319
+ {
320
+ user_agent : 'mirrorweb.com' ,
321
+ regex : 'mirrorweb.com' ,
322
+ } ,
323
+ {
324
+ // CCBot is a bot that crawls the web to find content for the Common Crawl project.
325
+ // Common Crawl data is used in many AI training datasets, but the bot is not an AI
326
+ // crawler per se, so it's not included in the AI category.
327
+ user_agent : 'CCBot' ,
328
+ regex : 'CCBot' ,
329
+ } ,
330
+ ] ,
331
+ Translation : [
332
+ {
333
+ user_agent : 'TransSync' ,
334
+ regex : 'TransSync.*motionpoint' ,
335
+ } ,
336
+ {
337
+ user_agent : 'WovnCrawler' ,
338
+ regex : 'WovnCrawler' ,
339
+ } ,
167
340
] ,
168
341
} ;
0 commit comments