Merge pull request #413 from adobe/botsupdate
feat(bots): add more bots, based on 2024-10-31 data
trieloff authored Nov 11, 2024
2 parents fe5c416 + 293328b commit f4383ba
Showing 1 changed file: src/bots.mjs (176 additions, 3 deletions)
@@ -15,7 +15,7 @@
// | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
// | filter $d.ua2 == 'bot'
// | groupby ua agg count() as count
// | sort by count desc
export const bots = {
Ads: [
{
@@ -39,6 +39,10 @@ export const bots = {
user_agent: 'AmazonProductDiscovery/1.0',
regex: 'AmazonProductDiscovery',
},
{
user_agent: 'Storebot-Google/1.0',
regex: 'Storebot-Google',
},
],
Quality: [
{
@@ -57,9 +61,16 @@ export const bots = {
user_agent: 'OSVCLinkChecker/1.0',
regex: 'OSVCLinkChecker',
},
{
user_agent: 'TagInspector/500.1',
regex: 'TagInspector',
},
{
user_agent: 'SiteCheck-sitecrawl by Siteimprove.com',
regex: 'Siteimprove',
},
],
Monitoring: [
{
user_agent: 'DatadogSynthetics',
regex: 'DatadogSynthetics',
@@ -100,12 +111,52 @@
user_agent: 'RuxitSynthetic various versions',
regex: 'RuxitSynthetic',
},
{
user_agent: 'StatusCake_Pagespeed_Indev',
regex: 'StatusCake_Pagespeed',
},
{
user_agent: 'One.Shop New Relic Synthetics',
regex: 'New Relic Synthetics',
},
{
user_agent: 'Splunk Synthetics',
regex: 'Splunk Synthetics',
},
{
user_agent: 'VisualMonitoring/0.1',
regex: 'VisualMonitoring',
},
{
user_agent: 'ERAMonitor',
regex: 'ERAMonitor',
},
{
user_agent: 'WatchMouse',
regex: 'watchmouse.com',
},
{
user_agent: 'Elastic/Synthetics',
regex: 'Elastic/Synthetics',
},
],
Social: [
{
user_agent: 'facebookexternalhit/1.1',
regex: 'facebookexternalhit',
},
{
user_agent: 'Pinterestbot/1.0',
regex: 'Pinterestbot',
},
{
user_agent: 'Slackbot-LinkExpanding',
regex: 'Slackbot-LinkExpanding',
},
{
user_agent: 'Iframely/1.3.1',
regex: 'Iframely',
},
],
SEO: [
{
@@ -133,6 +184,54 @@
regex: 'https://deepcrawl.com/bot',
},
],
// There is some overlap between AI and Search, and some companies like Apple,
// Google, and Meta operate many bots that are used for different purposes.
// The repo https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.json
// has a longer list of AI bots, but many of them don't have enough traffic
// to be worth adding here. In addition, that repo seems to err on the side of
// classifying crawlers as AI bots.
AI: [
{
user_agent: 'Ai2Bot-Dolma',
regex: 'Ai2Bot-Dolma',
},
{
user_agent: 'GPTBot',
regex: 'GPTBot',
},
{
user_agent: 'Claude-Web',
regex: 'Claude-Web',
},
{
user_agent: 'anthropic-ai',
regex: 'anthropic-ai',
},
{
user_agent: 'Google-Extended',
regex: 'Google-Extended',
},
{
user_agent: 'FacebookBot',
regex: 'FacebookBot',
},
{
user_agent: 'Applebot-Extended',
regex: 'Applebot.*Extended',
},
{
user_agent: 'Meta-ExternalAgent',
regex: 'Meta-ExternalAgent',
},
{
user_agent: 'PerplexityBot',
regex: 'PerplexityBot',
},
{
user_agent: 'YouBot',
regex: 'YouBot',
},
],
Search: [
{
user_agent: 'CoveoBot/2.0',
@@ -144,7 +243,7 @@
},
{
user_agent: 'YandexRenderResourcesBot/1.0',
regex: 'Yandex',
},
{
user_agent: 'MS-Search Crawler',
@@ -158,11 +257,85 @@
user_agent: 'Googlebot/2.1',
regex: 'Googlebot',
},
{
user_agent: 'Applebot/0.1',
regex: 'Applebot',
},
{
user_agent: 'AddSearchBot/1.0',
regex: 'AddSearchBot',
},
{
user_agent: 'ClarityBot/9.0',
regex: 'ClarityBot',
},
{
user_agent: 'PetalBot',
regex: 'PetalBot',
},
],
Security: [
{
user_agent: 'Popetech-Scanbot/1.0',
regex: 'Popetech-Scanbot',
},
{
user_agent: 'Detectify',
regex: 'Detectify',
},
{
user_agent: 'Probely',
regex: 'Probely',
},
{
user_agent: 'VirusTotalBot',
regex: 'VirusTotalBot',
},
],
Compliance: [
{
user_agent: 'Cookiebot/1.0',
regex: 'Cookiebot',
},
{
user_agent: 'OneTrustBot',
regex: 'OneTrustBot',
},
{
user_agent: 'CookieReports.com',
regex: 'CookieReports.com',
},
{
user_agent: 'ActiveComply',
regex: 'ActiveComply',
},
],
Archive: [
{
user_agent: 'PageFreezer',
regex: 'PageFreezer',
},
{
user_agent: 'mirrorweb.com',
regex: 'mirrorweb.com',
},
{
// CCBot crawls the web to collect content for the Common Crawl project.
// Common Crawl data is used to build many AI training datasets, but the bot
// itself is not an AI crawler per se, so it's not included in the AI category.
user_agent: 'CCBot',
regex: 'CCBot',
},
],
Translation: [
{
user_agent: 'TransSync',
regex: 'TransSync.*motionpoint',
},
{
user_agent: 'WovnCrawler',
regex: 'WovnCrawler',
},
],
};
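Each entry above pairs a representative user_agent string with a regex fragment that does the actual matching. As a minimal consumption sketch (the classifyUserAgent helper below is hypothetical, not part of this module), the fragments can be compiled with new RegExp() and tested against a raw User-Agent header, assuming they are meant as case-sensitive, unanchored patterns:

// Hypothetical usage sketch, not part of bots.mjs: return the first
// category whose regex fragment matches the given User-Agent string.
import { bots } from './bots.mjs';

export function classifyUserAgent(ua) {
  for (const [category, entries] of Object.entries(bots)) {
    for (const { regex } of entries) {
      // Fragments like 'Applebot.*Extended' compile as unanchored,
      // case-sensitive patterns; literal dots (e.g. 'mirrorweb.com')
      // also match any character, which is acceptable for a sketch.
      if (new RegExp(regex).test(ua)) {
        return category;
      }
    }
  }
  return null; // not a known bot
}

// Example: classifyUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1)')
// returns 'Search'.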
