Merge pull request #413 from adobe/botsupdate
feat(bots): add more bots, based on 2024-10-31 data
trieloff authored Nov 11, 2024
2 parents fe5c416 + 293328b commit f4383ba
Showing 1 changed file: src/bots.mjs (176 additions, 3 deletions)
@@ -15,7 +15,7 @@
// | select $d.request.user_agent as ua, $d.rum.user_agent as ua2
// | filter $d.ua2 == 'bot'
// | groupby ua agg count() as count
// | sort by count desc
export const bots = {
Ads: [
{
@@ -39,6 +39,10 @@ export const bots = {
user_agent: 'AmazonProductDiscovery/1.0',
regex: 'AmazonProductDiscovery',
},
{
user_agent: 'Storebot-Google/1.0',
regex: 'Storebot-Google',
},
],
Quality: [
{
@@ -57,9 +61,16 @@ export const bots = {
user_agent: 'OSVCLinkChecker/1.0',
regex: 'OSVCLinkChecker',
},
{
user_agent: 'TagInspector/500.1',
regex: 'TagInspector',
},
{
user_agent: 'SiteCheck-sitecrawl by Siteimprove.com',
regex: 'Siteimprove',
},
],
Monitoring: [
{
user_agent: 'DatadogSynthetics',
regex: 'DatadogSynthetics',
@@ -100,12 +111,52 @@
user_agent: 'RuxitSynthetic various versions',
regex: 'RuxitSynthetic',
},
{
user_agent: 'StatusCake_Pagespeed_Indev',
regex: 'StatusCake_Pagespeed',
},
{
user_agent: 'One.Shop New Relic Synthetics',
regex: 'New Relic Synthetics',
},
{
user_agent: 'Splunk Synthetics',
regex: 'Splunk Synthetics',
},
{
user_agent: 'VisualMonitoring/0.1',
regex: 'VisualMonitoring',
},
{
user_agent: 'ERAMonitor',
regex: 'ERAMonitor',
},
{
user_agent: 'WatchMouse',
regex: 'watchmouse.com',
},
{
user_agent: 'Elastic/Synthetics',
regex: 'Elastic/Synthetics',
},
],
Social: [
{
user_agent: 'facebookexternalhit/1.1',
regex: 'facebookexternalhit',
},
{
user_agent: 'Pinterestbot/1.0',
regex: 'Pinterestbot',
},
{
user_agent: 'Slackbot-LinkExpanding',
regex: 'Slackbot-LinkExpanding',
},
{
user_agent: 'Iframely/1.3.1',
regex: 'Iframely',
},
],
SEO: [
{
@@ -133,6 +184,54 @@
regex: 'https://deepcrawl.com/bot',
},
],
// There is some overlap between AI and Search, and some companies like Apple,
// Google, and Meta operate many bots that are used for different purposes.
// The repo https://github.com/ai-robots-txt/ai.robots.txt/blob/main/robots.json
// has a longer list of AI bots, but many of them don't have enough traffic
// to be worth adding here. In addition, that repo seems to err on the side of
// classifying crawlers as AI bots.
AI: [
{
user_agent: 'Ai2Bot-Dolma',
regex: 'Ai2Bot-Dolma',
},
{
user_agent: 'GPTBot',
regex: 'GPTBot',
},
{
user_agent: 'Claude-Web',
regex: 'Claude-Web',
},
{
user_agent: 'anthropic-ai',
regex: 'anthropic-ai',
},
{
user_agent: 'Google-Extended',
regex: 'Google-Extended',
},
{
user_agent: 'FacebookBot',
regex: 'FacebookBot',
},
{
user_agent: 'Applebot-Extended',
regex: 'Applebot.*Extended',
},
{
user_agent: 'Meta-ExternalAgent',
regex: 'Meta-ExternalAgent',
},
{
user_agent: 'PerplexityBot',
regex: 'PerplexityBot',
},
{
user_agent: 'YouBot',
regex: 'YouBot',
},
],
Search: [
{
user_agent: 'CoveoBot/2.0',
@@ -144,7 +243,7 @@
},
{
user_agent: 'YandexRenderResourcesBot/1.0',
regex: 'Yandex',
},
{
user_agent: 'MS-Search Crawler',
@@ -158,11 +257,85 @@
user_agent: 'Googlebot/2.1',
regex: 'Googlebot',
},
{
user_agent: 'Applebot/0.1',
regex: 'Applebot',
},
{
user_agent: 'AddSearchBot/1.0',
regex: 'AddSearchBot',
},
{
user_agent: 'ClarityBot/9.0',
regex: 'ClarityBot',
},
{
user_agent: 'PetalBot',
regex: 'PetalBot',
},
],
Security: [
{
user_agent: 'Popetech-Scanbot/1.0',
regex: 'Popetech-Scanbot',
},
{
user_agent: 'Detectify',
regex: 'Detectify',
},
{
user_agent: 'Probely',
regex: 'Probely',
},
{
user_agent: 'VirusTotalBot',
regex: 'VirusTotalBot',
},
],
Compliance: [
{
user_agent: 'Cookiebot/1.0',
regex: 'Cookiebot',
},
{
user_agent: 'OneTrustBot',
regex: 'OneTrustBot',
},
{
user_agent: 'CookieReports.com',
regex: 'CookieReports.com',
},
{
user_agent: 'ActiveComply',
regex: 'ActiveComply',
},
],
Archive: [
{
user_agent: 'PageFreezer',
regex: 'PageFreezer',
},
{
user_agent: 'mirrorweb.com',
regex: 'mirrorweb.com',
},
{
// CCBot crawls the web to collect content for the Common Crawl project.
// Common Crawl data is used to build many AI training datasets, but the bot
// itself is not an AI crawler per se, so it's not included in the AI category.
user_agent: 'CCBot',
regex: 'CCBot',
},
],
Translation: [
{
user_agent: 'TransSync',
regex: 'TransSync.*motionpoint',
},
{
user_agent: 'WovnCrawler',
regex: 'WovnCrawler',
},
],
};
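Each entry above pairs a representative user_agent string with a regex fragment that does the actual matching. As a minimal consumption sketch (the classifyUserAgent helper below is hypothetical, not part of this module), the fragments can be compiled with new RegExp() and tested against a raw User-Agent header, assuming they are meant as case-sensitive, unanchored patterns:

// Hypothetical usage sketch, not part of bots.mjs: return the first
// category whose regex fragment matches the given User-Agent string.
import { bots } from './bots.mjs';

export function classifyUserAgent(ua) {
  for (const [category, entries] of Object.entries(bots)) {
    for (const { regex } of entries) {
      // Fragments like 'Applebot.*Extended' compile as unanchored,
      // case-sensitive patterns; literal dots (e.g. 'mirrorweb.com')
      // also match any character, which is acceptable for a sketch.
      if (new RegExp(regex).test(ua)) {
        return category;
      }
    }
  }
  return null; // not a known bot
}

// Example: classifyUserAgent('Mozilla/5.0 (compatible; Googlebot/2.1)')
// returns 'Search'.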
