From eb989f0c74bed32e7c1dc6d3bb7c2817d2cdbcc3 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Mon, 7 Oct 2024 05:40:12 +0200 Subject: [PATCH 1/6] Improves detection for generic bots --- Tests/fixtures/bots.yml | 4 ++++ regexes/bots.yml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 5398638377..591bc675f4 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -8309,3 +8309,7 @@ producer: name: Immutable, SNC url: https://ohdear.app/ +- + user_agent: Mozilla/5.0 Keydrop + bot: + name: Generic Bot diff --git a/regexes/bots.yml b/regexes/bots.yml index d5efb132a6..c4fcfe0e0f 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4823,7 +4823,7 @@ url: 'https://ohdear.app/' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordup|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordu|Keydrop|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections From 8c8d0ef453ebe6ccd575b825680cb687cbe96e03 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Mon, 7 Oct 2024 05:42:17 +0200 Subject: [PATCH 2/6] Adds detection for Inspici --- Tests/fixtures/bots.yml | 9 +++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 591bc675f4..850e46410a 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -8313,3 +8313,12 @@ user_agent: Mozilla/5.0 Keydrop bot: name: Generic Bot +- + user_agent: Mozilla/6.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Inspici (www.inspici.com) + bot: + name: Inspici + category: Crawler + url: https://www.inspici.com/ + producer: + name: Inspici, LLC + url: https://www.inspici.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index c4fcfe0e0f..85db791ad1 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4822,6 +4822,14 @@ name: 'Immutable, SNC' url: 'https://ohdear.app/' +- regex: 'Inspici' + name: 'Inspici' + category: 'Crawler' + url: 'https://www.inspici.com/' + producer: + name: 'Inspici, LLC' + url: 'https://www.inspici.com/' + # Generic bots - regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordu|Keydrop|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot' From 0935d5182616f7c80c74ffb7be30d924b2d05adb Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Mon, 7 Oct 2024 05:57:34 +0200 Subject: [PATCH 3/6] Adds detection for Meta-ExternalAgent --- Tests/fixtures/bots.yml | 18 ++++++++++++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 850e46410a..fe5e00197a 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -8322,3 +8322,21 @@ producer: name: Inspici, LLC url: https://www.inspici.com/ +- + user_agent: meta-externalagent/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler) + bot: + name: Meta-ExternalAgent + category: Crawler + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers + producer: + name: Meta Platforms, Inc. + url: https://www.meta.com/ +- + user_agent: meta-externalagent/1.1 + bot: + name: Meta-ExternalAgent + category: Crawler + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers + producer: + name: Meta Platforms, Inc. + url: https://www.meta.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 85db791ad1..f72801fe25 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -599,6 +599,14 @@ name: 'Meta Platforms, Inc.' url: 'https://www.meta.com/' +- regex: 'meta-externalagent' + name: 'Meta-ExternalAgent' + category: 'Crawler' + url: 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers' + producer: + name: 'Meta Platforms, Inc.' + url: 'https://www.meta.com/' + - regex: 'FacebookBot/[\d.]+' name: 'FacebookBot' category: 'Crawler' From 9e6dc62235a8f693077560dd8eceb0e891d7f8f1 Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Mon, 7 Oct 2024 05:59:44 +0200 Subject: [PATCH 4/6] Adds detection for Meta-ExternalFetcher --- Tests/fixtures/bots.yml | 18 ++++++++++++++++++ regexes/bots.yml | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index fe5e00197a..c025370ad6 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -8340,3 +8340,21 @@ producer: name: Meta Platforms, Inc. url: https://www.meta.com/ +- + user_agent: meta-externalfetcher/1.1 (+https://developers.facebook.com/docs/sharing/webmasters/crawler) + bot: + name: Meta-ExternalFetcher + category: Social Media Agent + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers + producer: + name: Meta Platforms, Inc. + url: https://www.meta.com/ +- + user_agent: meta-externalfetcher/1.1 + bot: + name: Meta-ExternalFetcher + category: Social Media Agent + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers + producer: + name: Meta Platforms, Inc. + url: https://www.meta.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index f72801fe25..75c7e2bdb0 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -607,6 +607,14 @@ name: 'Meta Platforms, Inc.' url: 'https://www.meta.com/' +- regex: 'meta-externalfetcher' + name: 'Meta-ExternalFetcher' + category: 'Social Media Agent' + url: 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers' + producer: + name: 'Meta Platforms, Inc.' + url: 'https://www.meta.com/' + - regex: 'FacebookBot/[\d.]+' name: 'FacebookBot' category: 'Crawler' From 3eb90f6025525665f30aae316cca8c81d61c6b9b Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Mon, 7 Oct 2024 06:01:27 +0200 Subject: [PATCH 5/6] Fix url for Facebook crawlers --- Tests/fixtures/bots.yml | 10 +++++----- regexes/bots.yml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index c025370ad6..7df4111ee3 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -950,7 +950,7 @@ bot: name: Facebook Crawler category: Social Media Agent - url: https://developers.facebook.com/docs/sharing/webmasters/crawler/ + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers producer: name: Meta Platforms, Inc. url: https://www.meta.com/ @@ -959,7 +959,7 @@ bot: name: Facebook Crawler category: Social Media Agent - url: https://developers.facebook.com/docs/sharing/webmasters/crawler/ + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers producer: name: Meta Platforms, Inc. url: https://www.meta.com/ @@ -968,7 +968,7 @@ bot: name: Facebook Crawler category: Social Media Agent - url: https://developers.facebook.com/docs/sharing/webmasters/crawler/ + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers producer: name: Meta Platforms, Inc. url: https://www.meta.com/ @@ -4642,7 +4642,7 @@ bot: name: Facebook Crawler category: Social Media Agent - url: https://developers.facebook.com/docs/sharing/webmasters/crawler/ + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers producer: name: Meta Platforms, Inc. url: https://www.meta.com/ @@ -7762,7 +7762,7 @@ bot: name: Facebook Crawler category: Social Media Agent - url: https://developers.facebook.com/docs/sharing/webmasters/crawler/ + url: https://developers.facebook.com/docs/sharing/webmasters/web-crawlers producer: name: Meta Platforms, Inc. url: https://www.meta.com/ diff --git a/regexes/bots.yml b/regexes/bots.yml index 75c7e2bdb0..9fa82138ab 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -594,7 +594,7 @@ - regex: 'facebook(?:catalog|externalhit|externalua|platform|scraper)' name: 'Facebook Crawler' category: 'Social Media Agent' - url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/' + url: 'https://developers.facebook.com/docs/sharing/webmasters/web-crawlers' producer: name: 'Meta Platforms, Inc.' url: 'https://www.meta.com/' From 152443ef66bbd8159898bc3e3437fb2ea8eae5ad Mon Sep 17 00:00:00 2001 From: Liviu-Mihail Concioiu Date: Mon, 7 Oct 2024 06:08:54 +0200 Subject: [PATCH 6/6] Fix regex --- regexes/bots.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regexes/bots.yml b/regexes/bots.yml index 9fa82138ab..20089fa273 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -4847,7 +4847,7 @@ url: 'https://www.inspici.com/' # Generic bots -- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordu|Keydrop|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$' +- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordup|Keydrop|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$' name: 'Generic Bot' # Generic detections