From 1d9d4fab39dd16028ac382aa523134871d2439a9 Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Thu, 31 Oct 2024 15:50:34 +0100
Subject: [PATCH 1/4] feat(bots): add more bots, based on 2024-10-31 data

This adds two categories of bots: Compliance and Archive. No AI bots yet.
---
 src/bots.mjs | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/bots.mjs b/src/bots.mjs
index 5d70e6b..7037222 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -15,7 +15,6 @@
 // | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
 // | filter $d.ua2 == 'bot'
 // | groupby ua agg count() as count
-// | orderby count desc;
 export const bots = {
   Ads: [
     {
@@ -39,6 +38,10 @@ export const bots = {
       user_agent: 'AmazonProductDiscovery/1.0',
       regex: 'AmazonProductDiscovery',
     },
+    {
+      user_agent: 'Storebot-Google/1.0',
+      regex: 'Storebot-Google',
+    },
   ],
   Quality: [
     {
@@ -57,9 +60,16 @@ export const bots = {
       user_agent: 'OSVCLinkChecker/1.0',
       regex: 'OSVCLinkChecker',
     },
+    {
+      user_agent: 'TagInspector/500.1',
+      regex: 'TagInspector',
+    },
+    {
+      user_agent: 'SiteCheck-sitecrawl by Siteimprove.com',
+      regex: 'Siteimprove',
+    },
   ],
   Monitoring: [
-
     {
       user_agent: 'DatadogSynthetics',
       regex: 'DatadogSynthetics',
@@ -100,12 +110,32 @@ export const bots = {
       user_agent: 'RuxitSynthetic various versions',
       regex: 'RuxitSynthetic',
     },
+    {
+      user_agent: 'StatusCake_Pagespeed_Indev',
+      regex: 'StatusCake_Pagespeed',
+    },
+    {
+      user_agent: 'One.Shop New Relic Synthetics',
+      regex: 'New Relic Synthetics',
+    },
+    {
+      user_agent: 'Splunk Synthetics',
+      regex: 'Splunk Synthetics',
+    },
+    {
+      user_agent: 'VisualMonitoring/0.1',
+      regex: 'VisualMonitoring',
+    },
   ],
   Social: [
     {
       user_agent: 'facebookexternalhit/1.1',
       regex: 'facebookexternalhit',
     },
+    {
+      user_agent: 'Pinterestbot/1.0',
+      regex: 'Pinterestbot',
+    },
   ],
   SEO: [
     {
@@ -158,11 +188,51 @@ export const bots = {
       user_agent: 'Googlebot/2.1',
       regex: 'Googlebot',
     },
+    {
+      user_agent: 'Applebot/0.1',
+      regex: 'Applebot',
+    },
+    {
+      user_agent: 'AddSearchBot/1.0',
+      regex: 'AddSearchBot',
+    },
+    {
+      user_agent: 'ClarityBot/9.0',
+      regex: 'ClarityBot',
+    },
   ],
   Security: [
     {
       user_agent: 'Popetech-Scanbot/1.0',
       regex: 'Popetech-Scanbot',
     },
+    {
+      user_agent: 'Detectify',
+      regex: 'Detectify',
+    },
+    {
+      user_agent: 'Probely',
+      regex: 'Probely',
+    },
+  ],
+  Compliance: [
+    {
+      user_agent: 'Cookiebot/1.0',
+      regex: 'Cookiebot',
+    },
+    {
+      user_agent: 'OneTrustBot',
+      regex: 'OneTrustBot',
+    },
+  ],
+  Archive: [
+    {
+      user_agent: 'PageFreezer',
+      regex: 'PageFreezer',
+    },
+    {
+      user_agent: 'mirrorweb.com',
+      regex: 'mirrorweb.com',
+    },
   ],
 };
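An aside on how this table gets consumed: each entry pairs a literal `user_agent` sample with a `regex` field, and the `regex` values are RegExp source fragments rather than plain substrings (a later patch uses `TransSync.*motionpoint`, for example). A minimal sketch of a matcher built on top of this export; the `classifyBot` helper and the way the fragments are combined are illustrative assumptions, not part of the patch:

```js
import { bots } from './src/bots.mjs';

// Compile one RegExp per category by OR-ing its fragments together.
// Assumption: the fragments are patterns, so they are joined unescaped.
const matchers = Object.entries(bots).map(([category, entries]) => ({
  category,
  pattern: new RegExp(entries.map(({ regex }) => `(${regex})`).join('|')),
}));

// Hypothetical helper: returns the first category whose pattern matches
// the raw User-Agent header, or null for presumed-human traffic.
export function classifyBot(userAgent) {
  const hit = matchers.find(({ pattern }) => pattern.test(userAgent));
  return hit ? hit.category : null;
}

// Example: the Pinterestbot entry added above lands in the Social category.
// classifyBot('Mozilla/5.0 (compatible; Pinterestbot/1.0; +https://www.pinterest.com/bot.html)')
// -> 'Social'
```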
From 14949a9d449f3f03658da8d2fe1065a67e2922da Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Thu, 31 Oct 2024 16:05:02 +0100
Subject: [PATCH 2/4] feat(bots): add more bots, based on an extensive list of bots

This is based on the longer scan, about 1000 different bots, and adds the
Translation and AI categories.
---
 src/bots.mjs | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/bots.mjs b/src/bots.mjs
index 7037222..2180291 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -126,6 +126,18 @@ export const bots = {
       user_agent: 'VisualMonitoring/0.1',
       regex: 'VisualMonitoring',
     },
+    {
+      user_agent: 'ERAMonitor',
+      regex: 'ERAMonitor',
+    },
+    {
+      user_agent: 'WatchMouse',
+      regex: 'watchmouse.com',
+    },
+    {
+      user_agent: 'Elastic/Synthetics',
+      regex: 'Elastic/Synthetics',
+    },
   ],
   Social: [
     {
@@ -136,6 +148,14 @@ export const bots = {
       user_agent: 'Pinterestbot/1.0',
       regex: 'Pinterestbot',
     },
+    {
+      user_agent: 'Slackbot-LinkExpanding',
+      regex: 'Slackbot-LinkExpanding',
+    },
+    {
+      user_agent: 'Iframely/1.3.1',
+      regex: 'Iframely',
+    },
   ],
   SEO: [
     {
@@ -174,7 +194,7 @@ export const bots = {
     },
     {
       user_agent: 'YandexRenderResourcesBot/1.0',
-      regex: 'YandexRenderResourcesBot',
+      regex: 'Yandex',
     },
     {
       user_agent: 'MS-Search Crawler',
@@ -200,6 +220,10 @@ export const bots = {
       user_agent: 'ClarityBot/9.0',
       regex: 'ClarityBot',
     },
+    {
+      user_agent: 'PetalBot',
+      regex: 'PetalBot',
+    },
   ],
   Security: [
     {
@@ -214,6 +238,11 @@ export const bots = {
       user_agent: 'Probely',
       regex: 'Probely',
     },
+    {
+      user_agent: 'VirusTotalBot',
+      regex: 'VirusTotalBot',
+    },
+
   ],
   Compliance: [
     {
@@ -224,6 +253,14 @@ export const bots = {
       user_agent: 'OneTrustBot',
       regex: 'OneTrustBot',
     },
+    {
+      user_agent: 'CookieReports.com',
+      regex: 'CookieReports.com',
+    },
+    {
+      user_agent: 'ActiveComply',
+      regex: 'ActiveComply',
+    },
   ],
   Archive: [
     {
@@ -235,4 +272,20 @@ export const bots = {
       regex: 'mirrorweb.com',
     },
   ],
+  Translation: [
+    {
+      user_agent: 'TransSync',
+      regex: 'TransSync.*motionpoint',
+    },
+    {
+      user_agent: 'WovnCrawler',
+      regex: 'WovnCrawler',
+    },
+  ],
+  AI: [
+    {
+      user_agent: 'Ai2Bot-Dolma',
+      regex: 'Ai2Bot-Dolma',
+    },
+  ],
 };
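Two details in this patch are easy to miss. First, the Yandex entry is deliberately loosened from the specific `YandexRenderResourcesBot` to the family-wide `Yandex`, so a single fragment now covers every Yandex agent. Second, because the `regex` fields are patterns, unescaped metacharacters widen the match: the dot in `watchmouse.com` is a wildcard, not a literal. A quick illustrative check, under the same pattern semantics assumed in the sketch above:

```js
// One broad fragment covers a whole family of agents.
const yandex = new RegExp('Yandex');
console.log(yandex.test('Mozilla/5.0 (compatible; YandexRenderResourcesBot/1.0)')); // true
console.log(yandex.test('Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)')); // true

// The unescaped dot in 'watchmouse.com' matches any character, so the
// fragment is slightly wider than the literal domain name.
const watchmouse = new RegExp('watchmouse.com');
console.log(watchmouse.test('checks by watchmouse.com')); // true
console.log(watchmouse.test('watchmouseXcom')); // also true, wider than intended
```

The breadth is deliberate in the Yandex case and harmless in the WatchMouse case, but it is worth keeping in mind when adding fragments that contain dots or slashes.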
From 4b49da55000f08134715b7fc14d6070eaef31eee Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Fri, 1 Nov 2024 10:05:35 +0100
Subject: [PATCH 3/4] feat(bots): add more AI bots, based on ai.robots.txt

- some of these overlap with search
- some are archival crawlers also used for AI training
- some don't have enough traffic to be worth adding here
---
 src/bots.mjs | 61 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/src/bots.mjs b/src/bots.mjs
index 2180291..9067e2a 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -183,6 +183,54 @@ export const bots = {
       regex: 'https://deepcrawl.com/bot',
     },
   ],
+  // There is some overlap between AI and Search, and some companies like Apple,
+  // Google, and Meta have many bots that are used for different purposes.
+  // The repo https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.json
+  // has a longer list of AI bots, but many of them don't have enough traffic
+  // to be worth adding here. In addition, the repo seems to err on the side of
+  // classifying crawlers as AI bots.
+  AI: [
+    {
+      user_agent: 'Ai2Bot-Dolma',
+      regex: 'Ai2Bot-Dolma',
+    },
+    {
+      user_agent: 'GPTBot',
+      regex: 'GPTBot',
+    },
+    {
+      user_agent: 'Claude-Web',
+      regex: 'Claude-Web',
+    },
+    {
+      user_agent: 'anthropic-ai',
+      regex: 'anthropic-ai',
+    },
+    {
+      user_agent: 'Google-Extended',
+      regex: 'Google-Extended',
+    },
+    {
+      user_agent: 'FacebookBot',
+      regex: 'FacebookBot',
+    },
+    {
+      user_agent: 'Applebot-Extended',
+      regex: 'Applebot.*Extended',
+    },
+    {
+      user_agent: 'Meta-ExternalAgent',
+      regex: 'Meta-ExternalAgent',
+    },
+    {
+      user_agent: 'PerplexityBot',
+      regex: 'PerplexityBot',
+    },
+    {
+      user_agent: 'YouBot',
+      regex: 'YouBot',
+    },
+  ],
   Search: [
     {
       user_agent: 'CoveoBot/2.0',
@@ -271,6 +319,13 @@ export const bots = {
       user_agent: 'mirrorweb.com',
       regex: 'mirrorweb.com',
     },
+    {
+      // CCBot crawls the web to gather content for the Common Crawl project.
+      // Common Crawl corpora are used to train many AI models, but the bot
+      // itself is not an AI crawler per se, so it stays out of the AI category.
+      user_agent: 'CCBot',
+      regex: 'CCBot',
+    },
   ],
   Translation: [
     {
@@ -282,10 +337,4 @@ export const bots = {
       regex: 'WovnCrawler',
     },
   ],
-  AI: [
-    {
-      user_agent: 'Ai2Bot-Dolma',
-      regex: 'Ai2Bot-Dolma',
-    },
-  ],
 };

From 293328bfa9f18ee6e9238b53259f1794fb6d3e45 Mon Sep 17 00:00:00 2001
From: Lars Trieloff
Date: Mon, 4 Nov 2024 09:39:53 +0100
Subject: [PATCH 4/4] chore(bots): restore deleted comment line

---
 src/bots.mjs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/bots.mjs b/src/bots.mjs
index 9067e2a..1b5d4bc 100644
--- a/src/bots.mjs
+++ b/src/bots.mjs
@@ -15,6 +15,7 @@
 // | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
 // | filter $d.ua2 == 'bot'
 // | groupby ua agg count() as count
+// | sort by count desc
 export const bots = {
   Ads: [
     {
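For reference, the comment restored in the final patch documents the query that produced the bot counts in the first place: keep requests whose RUM classification is 'bot', group by user agent, and rank by frequency. A rough JavaScript equivalent over hypothetical log records; the record shape is inferred from the query's field names, not from this repository:

```js
// Hypothetical records, shaped after the fields the query selects:
// $d.request.user_agent (ua) and $d.rum.user_agent (ua2).
const records = [
  { request: { user_agent: 'Googlebot/2.1' }, rum: { user_agent: 'bot' } },
  { request: { user_agent: 'GPTBot/1.0' }, rum: { user_agent: 'bot' } },
  { request: { user_agent: 'Mozilla/5.0' }, rum: { user_agent: 'desktop' } },
  { request: { user_agent: 'Googlebot/2.1' }, rum: { user_agent: 'bot' } },
];

// filter ua2 == 'bot' | groupby ua agg count() as count | sort by count desc
const counts = new Map();
for (const { request, rum } of records) {
  if (rum.user_agent === 'bot') {
    counts.set(request.user_agent, (counts.get(request.user_agent) ?? 0) + 1);
  }
}
const ranked = [...counts.entries()].sort(([, a], [, b]) => b - a);
console.log(ranked); // [ [ 'Googlebot/2.1', 2 ], [ 'GPTBot/1.0', 1 ] ]
```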