From 1d9d4fab39dd16028ac382aa523134871d2439a9 Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Thu, 31 Oct 2024 15:50:34 +0100
Subject: [PATCH 1/4] feat(bots): add more bots, based on 2024-10-31 data

This adds two categories of bots: Compliance and Archive. No AI bots yet.
---
 src/bots.mjs | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/bots.mjs b/src/bots.mjs
index 5d70e6b..7037222 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -15,7 +15,6 @@
 // | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
 // | filter $d.ua2 == 'bot'
 // | groupby ua agg count() as count
-// | orderby count desc;
 export const bots = {
   Ads: [
     {
@@ -39,6 +38,10 @@ export const bots = {
       user_agent: 'AmazonProductDiscovery/1.0',
       regex: 'AmazonProductDiscovery',
     },
+    {
+      user_agent: 'Storebot-Google/1.0',
+      regex: 'Storebot-Google',
+    },
   ],
   Quality: [
     {
@@ -57,9 +60,16 @@ export const bots = {
       user_agent: 'OSVCLinkChecker/1.0',
       regex: 'OSVCLinkChecker',
     },
+    {
+      user_agent: 'TagInspector/500.1',
+      regex: 'TagInspector',
+    },
+    {
+      user_agent: 'SiteCheck-sitecrawl by Siteimprove.com',
+      regex: 'Siteimprove',
+    },
   ],
   Monitoring: [
-
     {
       user_agent: 'DatadogSynthetics',
       regex: 'DatadogSynthetics',
@@ -100,12 +110,32 @@ export const bots = {
       user_agent: 'RuxitSynthetic various versions',
       regex: 'RuxitSynthetic',
     },
+    {
+      user_agent: 'StatusCake_Pagespeed_Indev',
+      regex: 'StatusCake_Pagespeed',
+    },
+    {
+      user_agent: 'One.Shop New Relic Synthetics',
+      regex: 'New Relic Synthetics',
+    },
+    {
+      user_agent: 'Splunk Synthetics',
+      regex: 'Splunk Synthetics',
+    },
+    {
+      user_agent: 'VisualMonitoring/0.1',
+      regex: 'VisualMonitoring',
+    },
   ],
   Social: [
     {
       user_agent: 'facebookexternalhit/1.1',
       regex: 'facebookexternalhit',
     },
+    {
+      user_agent: 'Pinterestbot/1.0',
+      regex: 'Pinterestbot',
+    },
   ],
   SEO: [
     {
@@ -158,11 +188,51 @@ export const bots = {
       user_agent: 'Googlebot/2.1',
       regex: 'Googlebot',
     },
+    {
+      user_agent: 'Applebot/0.1',
+      regex: 'Applebot',
+    },
+    {
+      user_agent: 'AddSearchBot/1.0',
+      regex: 'AddSearchBot',
+    },
+    {
+      user_agent: 'ClarityBot/9.0',
+      regex: 'ClarityBot',
+    },
   ],
   Security: [
     {
       user_agent: 'Popetech-Scanbot/1.0',
       regex: 'Popetech-Scanbot',
     },
+    {
+      user_agent: 'Detectify',
+      regex: 'Detectify',
+    },
+    {
+      user_agent: 'Probely',
+      regex: 'Probely',
+    },
+  ],
+  Compliance: [
+    {
+      user_agent: 'Cookiebot/1.0',
+      regex: 'Cookiebot',
+    },
+    {
+      user_agent: 'OneTrustBot',
+      regex: 'OneTrustBot',
+    },
+  ],
+  Archive: [
+    {
+      user_agent: 'PageFreezer',
+      regex: 'PageFreezer',
+    },
+    {
+      user_agent: 'mirrorweb.com',
+      regex: 'mirrorweb.com',
+    },
   ],
 };
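An aside on how this table gets consumed: each entry pairs a literal `user_agent` sample with a `regex` field, and the `regex` values are RegExp source fragments rather than plain substrings (a later patch uses `TransSync.*motionpoint`, for example). A minimal sketch of a matcher built on top of this export; the `classifyBot` helper and the way the fragments are combined are illustrative assumptions, not part of the patch:

```js
import { bots } from './src/bots.mjs';

// Compile one RegExp per category by OR-ing its fragments together.
// Assumption: the fragments are patterns, so they are joined unescaped.
const matchers = Object.entries(bots).map(([category, entries]) => ({
  category,
  pattern: new RegExp(entries.map(({ regex }) => `(${regex})`).join('|')),
}));

// Hypothetical helper: returns the first category whose pattern matches
// the raw User-Agent header, or null for presumed-human traffic.
export function classifyBot(userAgent) {
  const hit = matchers.find(({ pattern }) => pattern.test(userAgent));
  return hit ? hit.category : null;
}

// Example: the Pinterestbot entry added above lands in the Social category.
// classifyBot('Mozilla/5.0 (compatible; Pinterestbot/1.0; +https://www.pinterest.com/bot.html)')
// -> 'Social'
```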
From 14949a9d449f3f03658da8d2fe1065a67e2922da Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Thu, 31 Oct 2024 16:05:02 +0100
Subject: [PATCH 2/4] feat(bots): add more bots, based on an extensive list of bots

This is based on the longer scan, about 1000 different bots, and adds the
Translation and AI categories.
---
 src/bots.mjs | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/bots.mjs b/src/bots.mjs
index 7037222..2180291 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -126,6 +126,18 @@ export const bots = {
       user_agent: 'VisualMonitoring/0.1',
       regex: 'VisualMonitoring',
     },
+    {
+      user_agent: 'ERAMonitor',
+      regex: 'ERAMonitor',
+    },
+    {
+      user_agent: 'WatchMouse',
+      regex: 'watchmouse.com',
+    },
+    {
+      user_agent: 'Elastic/Synthetics',
+      regex: 'Elastic/Synthetics',
+    },
   ],
   Social: [
     {
@@ -136,6 +148,14 @@ export const bots = {
       user_agent: 'Pinterestbot/1.0',
       regex: 'Pinterestbot',
     },
+    {
+      user_agent: 'Slackbot-LinkExpanding',
+      regex: 'Slackbot-LinkExpanding',
+    },
+    {
+      user_agent: 'Iframely/1.3.1',
+      regex: 'Iframely',
+    },
   ],
   SEO: [
     {
@@ -174,7 +194,7 @@ export const bots = {
     },
     {
       user_agent: 'YandexRenderResourcesBot/1.0',
-      regex: 'YandexRenderResourcesBot',
+      regex: 'Yandex',
     },
     {
       user_agent: 'MS-Search Crawler',
@@ -200,6 +220,10 @@ export const bots = {
       user_agent: 'ClarityBot/9.0',
       regex: 'ClarityBot',
     },
+    {
+      user_agent: 'PetalBot',
+      regex: 'PetalBot',
+    },
   ],
   Security: [
     {
@@ -214,6 +238,11 @@ export const bots = {
       user_agent: 'Probely',
       regex: 'Probely',
     },
+    {
+      user_agent: 'VirusTotalBot',
+      regex: 'VirusTotalBot',
+    },
+
   ],
   Compliance: [
     {
@@ -224,6 +253,14 @@ export const bots = {
       user_agent: 'OneTrustBot',
       regex: 'OneTrustBot',
     },
+    {
+      user_agent: 'CookieReports.com',
+      regex: 'CookieReports.com',
+    },
+    {
+      user_agent: 'ActiveComply',
+      regex: 'ActiveComply',
+    },
   ],
   Archive: [
     {
@@ -235,4 +272,20 @@ export const bots = {
       regex: 'mirrorweb.com',
     },
   ],
+  Translation: [
+    {
+      user_agent: 'TransSync',
+      regex: 'TransSync.*motionpoint',
+    },
+    {
+      user_agent: 'WovnCrawler',
+      regex: 'WovnCrawler',
+    },
+  ],
+  AI: [
+    {
+      user_agent: 'Ai2Bot-Dolma',
+      regex: 'Ai2Bot-Dolma',
+    },
+  ],
 };
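Two details in this patch are easy to miss. First, the Yandex entry is deliberately loosened from the specific `YandexRenderResourcesBot` to the family-wide `Yandex`, so a single fragment now covers every Yandex agent. Second, because the `regex` fields are patterns, unescaped metacharacters widen the match: the dot in `watchmouse.com` is a wildcard, not a literal. A quick illustrative check, under the same pattern semantics assumed in the sketch above:

```js
// One broad fragment covers a whole family of agents.
const yandex = new RegExp('Yandex');
console.log(yandex.test('Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0)')); // true
console.log(yandex.test('Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)')); // true

// The unescaped dot in 'watchmouse.com' matches any character, so the
// fragment is slightly wider than the literal domain name.
const watchmouse = new RegExp('watchmouse.com');
console.log(watchmouse.test('checks by watchmouse.com')); // true
console.log(watchmouse.test('watchmouseXcom')); // also true, wider than intended
```

The breadth is deliberate in the Yandex case and harmless in the WatchMouse case, but it is worth keeping in mind when adding fragments that contain dots or slashes.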
From 4b49da55000f08134715b7fc14d6070eaef31eee Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Fri, 1 Nov 2024 10:05:35 +0100
Subject: [PATCH 3/4] feat(bots): add more AI bots, based on ai.robots.txt

- some of these overlap with search
- some are archival crawlers also used for AI training
- some don't have enough traffic to be worth adding here
---
 src/bots.mjs | 61 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/src/bots.mjs b/src/bots.mjs
index 2180291..9067e2a 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -183,6 +183,54 @@ export const bots = {
       regex: 'https://deepcrawl.com/bot',
     },
   ],
+  // There is some overlap between AI and Search, and some companies like Apple,
+  // Google, and Meta have many bots that are used for different purposes.
+  // The repo https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.json
+  // has a longer list of AI bots, but many of them don't have enough traffic
+  // to be worth adding here. In addition, the repo seems to err on the side of
+  // classifying crawlers as AI bots.
+  AI: [
+    {
+      user_agent: 'Ai2Bot-Dolma',
+      regex: 'Ai2Bot-Dolma',
+    },
+    {
+      user_agent: 'GPTBot',
+      regex: 'GPTBot',
+    },
+    {
+      user_agent: 'Claude-Web',
+      regex: 'Claude-Web',
+    },
+    {
+      user_agent: 'anthropic-ai',
+      regex: 'anthropic-ai',
+    },
+    {
+      user_agent: 'Google-Extended',
+      regex: 'Google-Extended',
+    },
+    {
+      user_agent: 'FacebookBot',
+      regex: 'FacebookBot',
+    },
+    {
+      user_agent: 'Applebot-Extended',
+      regex: 'Applebot.*Extended',
+    },
+    {
+      user_agent: 'Meta-ExternalAgent',
+      regex: 'Meta-ExternalAgent',
+    },
+    {
+      user_agent: 'PerplexityBot',
+      regex: 'PerplexityBot',
+    },
+    {
+      user_agent: 'YouBot',
+      regex: 'YouBot',
+    },
+  ],
   Search: [
     {
       user_agent: 'CoveoBot/2.0',
@@ -271,6 +319,13 @@ export const bots = {
       user_agent: 'mirrorweb.com',
       regex: 'mirrorweb.com',
     },
+    {
+      // CCBot crawls the web to gather content for the Common Crawl project.
+      // Common Crawl corpora are used to train many AI models, but the bot
+      // itself is not an AI crawler per se, so it stays out of the AI category.
+      user_agent: 'CCBot',
+      regex: 'CCBot',
+    },
   ],
   Translation: [
     {
@@ -282,10 +337,4 @@ export const bots = {
       regex: 'WovnCrawler',
     },
   ],
-  AI: [
-    {
-      user_agent: 'Ai2Bot-Dolma',
-      regex: 'Ai2Bot-Dolma',
-    },
-  ],
 };

From 293328bfa9f18ee6e9238b53259f1794fb6d3e45 Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Mon, 4 Nov 2024 09:39:53 +0100
Subject: [PATCH 4/4] chore(bots): restore deleted comment line

---
 src/bots.mjs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/bots.mjs b/src/bots.mjs
index 9067e2a..1b5d4bc 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -15,6 +15,7 @@
 // | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
 // | filter $d.ua2 == 'bot'
 // | groupby ua agg count() as count
+// | sort by count desc
 export const bots = {
   Ads: [
     {
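For reference, the comment restored in the final patch documents the query that produced the bot counts in the first place: keep requests whose RUM classification is 'bot', group by user agent, and rank by frequency. A rough JavaScript equivalent over hypothetical log records; the record shape is inferred from the query's field names, not from this repository:

```js
// Hypothetical records, shaped after the fields the query selects:
// $d.request.user_agent (ua) and $d.rum.user_agent (ua2).
const records = [
  { request: { user_agent: 'Googlebot/2.1' }, rum: { user_agent: 'bot' } },
  { request: { user_agent: 'GPTBot/1.0' }, rum: { user_agent: 'bot' } },
  { request: { user_agent: 'Mozilla/5.0' }, rum: { user_agent: 'desktop' } },
  { request: { user_agent: 'Googlebot/2.1' }, rum: { user_agent: 'bot' } },
];

// filter ua2 == 'bot' | groupby ua agg count() as count | sort by count desc
const counts = new Map();
for (const { request, rum } of records) {
  if (rum.user_agent === 'bot') {
    counts.set(request.user_agent, (counts.get(request.user_agent) ?? 0) + 1);
  }
}
const ranked = [...counts.entries()].sort(([, a], [, b]) => b - a);
console.log(ranked); // [ [ 'Googlebot/2.1', 2 ], [ 'GPTBot/1.0', 1 ] ]
```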