From b04b4383e6263976c5ffeade3c726dd0db70a11f Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Fri, 31 Aug 2018 16:06:22 +0300 Subject: [PATCH] Version 2 Added ----- * Built-in cache with file-path and max-age * Automatically refresh from source (external repository) Removed ------- * Rake-tasks * A very large copy of external UA-bots list * `rack` as dependency Changed ------- * `voight_kampff/rack_request` isn't required by default * Default path for cache is now in `./tmp` directory instead of `./config` Development ----------- * [EditorConfig](https://editorconfig.org/) added * [RuboCop](https://github.com/rubocop-hq/rubocop) added and offenses fixed * Drop support for Ruby <= 2.2 (they even will not get security patches) * Ignore built gems for git --- .editorconfig | 9 + .gitignore | 1 + .rubocop.yml | 13 + .ruby-version | 1 - .travis.yml | 6 +- Gemfile | 2 + README.md | 40 +- config.ru | 2 + config/crawler-user-agents.json | 3436 ----------------- lib/tasks/voight_kampff.rake | 18 - lib/voight_kampff.rb | 11 +- lib/voight_kampff/engine.rb | 5 +- lib/voight_kampff/methods.rb | 19 +- lib/voight_kampff/rack_request.rb | 6 +- lib/voight_kampff/test.rb | 111 +- lib/voight_kampff/version.rb | 4 +- .../controllers/replicants_controller_spec.rb | 5 +- spec/fixtures/crawler-user-agents.json | 43 + .../app/controllers/replicants_controller.rb | 13 +- spec/internal/config/routes.rb | 2 + spec/lib/voight_kampff/rack_request_spec.rb | 8 +- spec/lib/voight_kampff/test_spec.rb | 134 +- spec/lib/voight_kampff_spec.rb | 6 +- spec/spec_helper.rb | 13 +- spec/support/humans.rb | 4 +- spec/support/replicants.rb | 4 +- voight_kampff.gemspec | 27 +- 27 files changed, 387 insertions(+), 3556 deletions(-) create mode 100644 .editorconfig create mode 100644 .rubocop.yml delete mode 100644 .ruby-version delete mode 100644 config/crawler-user-agents.json delete mode 100644 lib/tasks/voight_kampff.rake create mode 100644 spec/fixtures/crawler-user-agents.json diff --git 
a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..c6c8b36 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*] +indent_style = space +indent_size = 2 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/.gitignore b/.gitignore index 76f634c..8d18ff7 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ Gemfile.lock *.rbc coverage doc +*.gem diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..5788c50 --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,13 @@ +AllCops: + TargetRubyVersion: 2.4 + +Layout/MultilineMethodCallIndentation: + EnforcedStyle: indented + +Metrics/BlockLength: + Exclude: + - spec/**/* + +Metrics/LineLength: + Exclude: + - spec/support/* diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index 097a15a..0000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -2.6.2 diff --git a/.travis.yml b/.travis.yml index 61af5b7..bcf800c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,8 @@ language: ruby rvm: - - 2.4.6 - - 2.5.5 - - 2.6.2 + - 2.4 + - 2.5 + - 2.6 script: bundle exec rspec before_install: # fixes Travis CI error: NoMethodError: undefined method `spec' for nil:NilClass diff --git a/Gemfile b/Gemfile index fa75df1..7f4f5e9 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,5 @@ +# frozen_string_literal: true + source 'https://rubygems.org' gemspec diff --git a/README.md b/README.md index 2ce5c31..390da19 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,11 @@ Installation Configuration ------------- -A JSON file is used to match [user agent strings](http://simplyfast.info/browser) to a list of known bots. +By default, this gem downloads an [updatable list of known bots](https://github.com/monperrus/crawler-user-agents) and caches it in the `./tmp` directory.
-If you'd like to use an [updated list](https://github.com/monperrus/crawler-user-agents) or make your own customizations, run `rake voight_kampff:import_user_agents`. This will download a `crawler-user-agents.json` file into the `./config` directory. +You can disable file cache with `VoightKampff::Test.cache_path = nil`. + +Default TTL of cache is 24 hours. You can change it with `VoightKampff::Test.max_age = number_of_seconds`. __Note:__ The pattern entries in the JSON file are evaluated as [regular expressions](http://en.wikipedia.org/wiki/Regular_expression). @@ -23,21 +25,26 @@ Usage ----- There are three ways to use Voight-Kampff -1. Through Rack::Request such as in your [Ruby on Rails](http://rubyonrails.org) controllers: - `request.bot?` +1. Through `Rack::Request` in your app such as [Ruby on Rails](http://rubyonrails.org): + ```ruby + require 'voight_kampff/rack_request' + + request.bot? + ``` -2. Through the `VoightKampff` module: +2. Through the `VoightKampff` module: `VoightKampff.bot? 'your user agent string'` -3. Through a `VoightKampff::Test` instance: +3. Through a `VoightKampff::Test` instance: `VoightKampff::Test.new('your user agent string').bot?` -All of the above examples accept `human?` and `bot?` methods. All of these methods will return `true` or `false`. +All of the above examples accept `human?` and `bot?` methods. +All of these methods will return `true` or `false`. Upgrading to version 1.0 ------------------------ -Version 1.0 uses a new source for a list of bot user agent strings since the old source was no longer maintained. This new source, unfortuately, does not include as much detail. Therefore the following methods have been deprecated: +Version 1.0 uses a new source for a list of bot user agent strings since the old source was no longer maintained. This new source, unfortunately, does not include as much detail. 
Therefore the following methods have been deprecated: - `#browser?` - `#checker?` - `#downloader?` @@ -49,14 +56,21 @@ In general the `#bot?` command tends to include all of these and I'm sure it's u Also, the gem no longer extends `ActionDispatch::Request` instead it extends `Rack::Request` which `ActionDispatch::Request` inherits from. This allows the same functionality for Rails while opening the gem up to other rack-based projects. +Upgrading to version 2.0 +------------------------ + +Version 2.0 uses a built-in cache instead of `rake` tasks. + +Also, you have to explicitly require `voight_kampff/rack_request` for `Rack::Request` patching. + FAQ --- -__Q:__ __What's with the name?__ -__A:__ It's the [machine in Blade Runner](http://en.wikipedia.org/wiki/Blade_Runner#Voigt-Kampff_machine) that is used to test whether someone is a human or a replicant. +__Q:__ __What's with the name?__ +__A:__ It's the [machine in Blade Runner](http://en.wikipedia.org/wiki/Blade_Runner#Voight-Kampff_machine) that is used to test whether someone is a human or a replicant. -__Q:__ __I've found a bot that isn't being matched__ +__Q:__ __I've found a bot that isn't being matched__ __A:__ The list is being pulled from [github.com/monperrus/crawler-user-agents](https://github.com/monperrus/crawler-user-agents). -If you'd like to have entries added to the list, please create a pull request with that project. Once that pull request is merged, feel free to create an issue here and I'll release a new gem version with the updated list. In the meantime you can always run `rake voight_kampff:import_user_agents` on your project to get that updated list. +If you'd like to have entries added to the list, please create a pull request with that project. Once that pull request is merged, you'll get the new version at the next cache update. In the meantime you can always manually flush the cache in your project to get that updated list.
__Q:__ __Why don't you use the user agent list from ______________ If you know of a better source for a list of bot user agent strings, please create an issue and let me know. I'm open to switching to a better source or supporting multiple sources. There are others out there but I like the openness of monperrus' list. @@ -67,7 +81,7 @@ Thanks to [github.com/monperrus/crawler-user-agents](https://github.com/monperru Contributing ------------ -PR without tests will not get merged, Make sure you write tests for api and rails app. +PR without tests will not get merged, Make sure you write tests for API and Rails app. Feel free to ask for help, if you do not know how to write a determined test. Running Tests? diff --git a/config.ru b/config.ru index d49744b..2ba7766 100644 --- a/config.ru +++ b/config.ru @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'rubygems' require 'bundler' diff --git a/config/crawler-user-agents.json b/config/crawler-user-agents.json deleted file mode 100644 index c71a48e..0000000 --- a/config/crawler-user-agents.json +++ /dev/null @@ -1,3436 +0,0 @@ -[ - { - "pattern": "Googlebot\\/", - "url": "http://www.google.com/bot.html", - "instances": [ - "Googlebot/2.1 (+http://www.google.com/bot.html)", - "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36", - "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview Analytics) Chrome/27.0.1453 Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" - ] - } - , - { - "pattern": "Googlebot-Mobile", - "instances": [ - "DoCoMo/2.0 N905i(c100;TB;W24H16) (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)", - "Nokia6820/2.0 (4.83) Profile/MIDP-1.0 Configuration/CLDC-1.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)", - "SAMSUNG-SGH-E250/1.0 Profile/MIDP-2.0 Configuration/CLDC-1.1 UP.Browser/6.2.3.3.c.1.101 (GUI) MMP/2.0 (compatible; Googlebot-Mobile/2.1; +http://www.google.com/bot.html)" - ] - } - , - { - "pattern": "Googlebot-Image", - "instances": [ - "Googlebot-Image/1.0" - ] - } - , - { - "pattern": "Googlebot-News", - "instances": [ - "Googlebot-News" - ] - } - , - { - "pattern": "Googlebot-Video", - "instances": [ - "Googlebot-Video/1.0" - ] - } - , - { - "pattern": "AdsBot-Google([^-]|$)", - "url": "https://support.google.com/webmasters/answer/1061943?hl=en", - "instances": [ - "AdsBot-Google 
(+http://www.google.com/adsbot.html)" - ] - } - , - { - "pattern": "AdsBot-Google-Mobile", - "addition_date": "2017/08/21", - "url": "https://support.google.com/adwords/answer/2404197", - "instances": [ - "AdsBot-Google-Mobile-Apps", - "Mozilla/5.0 (Linux; Android 5.0; SM-G920A) AppleWebKit (KHTML, like Gecko) Chrome Mobile Safari (compatible; AdsBot-Google-Mobile; +http://www.google.com/mobile/adsbot.html)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 (compatible; AdsBot-Google-Mobile; +http://www.google.com/mobile/adsbot.html)" - ] - } - , - { - "pattern": "Feedfetcher-Google", - "addition_date": "2018/06/27", - "url": "https://support.google.com/webmasters/answer/178852", - "instances": [ - "Feedfetcher-Google; (+http://www.google.com/feedfetcher.html; 1 subscribers; feed-id=728742641706423)" - ] - } - , - { - "pattern": "Mediapartners-Google", - "url": "https://support.google.com/webmasters/answer/1061943?hl=en", - "instances": [ - "Mediapartners-Google", - "Mozilla/5.0 (compatible; MSIE or Firefox mutant; not on Windows server;) Daumoa/4.0 (Following Mediapartners-Google)", - "Mozilla/5.0 (iPhone; U; CPU iPhone OS 10_0 like Mac OS X; en-us) AppleWebKit/602.1.38 (KHTML, like Gecko) Version/10.0 Mobile/14A5297c Safari/602.1 (compatible; Mediapartners-Google/2.1; +http://www.google.com/bot.html)", - "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_1 like Mac OS X; en-us) AppleWebKit/532.9 (KHTML, like Gecko) Version/4.0.5 Mobile/8B117 Safari/6531.22.7 (compatible; Mediapartners-Google/2.1; +http://www.google.com/bot.html)" - ] - } - , - { - "pattern": "Mediapartners \\(Googlebot\\)", - "addition_date": "2017/08/08", - "url": "https://support.google.com/webmasters/answer/1061943?hl=en", - "instances": [] - } - , - { - "pattern": "APIs-Google", - "addition_date": "2017/08/08", - "url": "https://support.google.com/webmasters/answer/1061943?hl=en", - "instances": [ - "APIs-Google 
(+https://developers.google.com/webmasters/APIs-Google.html)" - ] - } - , - { - "pattern": "bingbot", - "url": "http://www.bing.com/bingbot.htm", - "instances": [ - "Mozilla/5.0 (Windows Phone 8.1; ARM; Trident/7.0; Touch; rv:11.0; IEMobile/11.0; NOKIA; Lumia 530) like Gecko (compatible; adidxbot/2.0; +http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (compatible; adidxbot/2.0; http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (compatible; adidxbot/2.0; +http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm", - "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) SitemapProbe", - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; adidxbot/2.0; http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; adidxbot/2.0; +http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", - "Mozilla/5.0 (seoanalyzer; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" - ] - } - , - { - "pattern": "Slurp", - "url": "http://help.yahoo.com/help/us/ysearch/slurp", - "instances": [ - "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)", - "Mozilla/5.0 (compatible; Yahoo! 
Slurp; http://help.yahoo.com/help/us/ysearch/slurp)", - "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)" - ] - } - , - { - "pattern": "[wW]get", - "instances": [ - "WGETbot/1.0 (+http://wget.alanreed.org)", - "Wget/1.14 (linux-gnu)" - ] - } - , - { - "pattern": "curl", - "instances": [ - "eCairn-Grabber/1.0 (+http://ecairn.com/grabber) curl/7.15" - ] - } - , - { - "pattern": "LinkedInBot", - "instances": [ - "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/3.1 +http://www.linkedin.com)", - "LinkedInBot/1.0 (compatible; Mozilla/5.0; Jakarta Commons-HttpClient/4.3 +http://www.linkedin.com)" - ] - } - , - { - "pattern": "Python-urllib", - "instances": [ - "Python-urllib/2.5", - "Python-urllib/2.5", - "Python-urllib/2.6", - "Python-urllib/2.7", - "Python-urllib/3.1", - "Python-urllib/3.2", - "Python-urllib/3.3", - "Python-urllib/3.4", - "Python-urllib/3.5", - "Python-urllib/3.6" - ] - } - , - { - "pattern": "python-requests", - "addition_date": "2018/05/27", - "instances": [ - "python-requests/2.18.4" - ] - } - , - { - "pattern": "libwww", - "instances": [ - "2Bone_LinkChecker/1.0 libwww-perl/6.03", - "2Bone_LinkChkr/1.0 libwww-perl/6.03", - "W3C-checklink/2.90 libwww-perl/5.64", - "W3C-checklink/3.6.2.3 libwww-perl/5.64", - "W3C-checklink/4.2 [4.20] libwww-perl/5.803", - "W3C-checklink/4.2.1 [4.21] libwww-perl/5.803", - "W3C-checklink/4.3 [4.42] libwww-perl/5.805", - "W3C-checklink/4.3 [4.42] libwww-perl/5.808", - "W3C-checklink/4.3 [4.42] libwww-perl/5.820", - "W3C-checklink/4.5 [4.154] libwww-perl/5.823", - "W3C-checklink/4.5 [4.160] libwww-perl/5.823", - "amibot - http://www.amidalla.de - tech@amidalla.com libwww-perl/5.831" - ] - } - , - { - "pattern": "httpunit", - "instances": [ - "httpunit/1.x" - ] - } - , - { - "pattern": "nutch", - "instances": [ - "NutchCVS/0.7.1 (Nutch; http://lucene.apache.org/nutch/bot.html; nutch-agent@lucene.apache.org)", - "istellabot-nutch/Nutch-1.10" - ] - } - , - { - "pattern": 
"Go-http-client", - "addition_date": "2016/03/26", - "url": "https://golang.org/pkg/net/http/", - "instances": [ - "Go-http-client/1.1" - ] - } - , - { - "pattern": "phpcrawl", - "addition_date": "2012-09/17", - "url": "http://phpcrawl.cuab.de/", - "instances": [ - "phpcrawl" - ] - } - , - { - "pattern": "msnbot", - "url": "http://search.msn.com/msnbot.htm", - "instances": [ - "adidxbot/1.1 (+http://search.msn.com/msnbot.htm)", - "adidxbot/2.0 (+http://search.msn.com/msnbot.htm)", - "librabot/1.0 (+http://search.msn.com/msnbot.htm)", - "librabot/2.0 (+http://search.msn.com/msnbot.htm)", - "msnbot-NewsBlogs/2.0b (+http://search.msn.com/msnbot.htm)", - "msnbot-UDiscovery/2.0b (+http://search.msn.com/msnbot.htm)", - "msnbot-media/1.0 (+http://search.msn.com/msnbot.htm)", - "msnbot-media/1.1 (+http://search.msn.com/msnbot.htm)", - "msnbot-media/2.0b (+http://search.msn.com/msnbot.htm)", - "msnbot/1.0 (+http://search.msn.com/msnbot.htm)", - "msnbot/1.1 (+http://search.msn.com/msnbot.htm)", - "msnbot/2.0b (+http://search.msn.com/msnbot.htm)", - "msnbot/2.0b (+http://search.msn.com/msnbot.htm).", - "msnbot/2.0b (+http://search.msn.com/msnbot.htm)._" - ] - } - , - { - "pattern": "jyxobot", - "instances": [] - } - , - { - "pattern": "FAST-WebCrawler", - "instances": [ - "FAST-WebCrawler/3.6/FirstPage (atw-crawler at fast dot no;http://fast.no/support/crawler.asp)", - "FAST-WebCrawler/3.7 (atw-crawler at fast dot no; http://fast.no/support/crawler.asp)", - "FAST-WebCrawler/3.7/FirstPage (atw-crawler at fast dot no;http://fast.no/support/crawler.asp)", - "FAST-WebCrawler/3.8" - ] - } - , - { - "pattern": "FAST Enterprise Crawler", - "instances": [ - "FAST Enterprise Crawler 6 / Scirus scirus-crawler@fast.no; http://www.scirus.com/srsapp/contactus/", - "FAST Enterprise Crawler 6 used by Schibsted (webcrawl@schibstedsok.no)" - ] - } - , - { - "pattern": "BIGLOTRON", - "instances": [ - "BIGLOTRON (Beta 2;GNU/Linux)" - ] - } - , - { - "pattern": "Teoma", - "instances": [ - 
"Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://sp.ask.com/docs/about/tech_crawling.html)", - "Mozilla/2.0 (compatible; Ask Jeeves/Teoma; +http://about.ask.com/en/docs/about/webmasters.shtml)" - ], - "url": "http://about.ask.com/en/docs/about/webmasters.shtml" - } - , - { - "pattern": "convera", - "instances": [ - "ConveraCrawler/0.9e (+http://ews.converasearch.com/crawl.htm)" - ], - "url": "http://ews.converasearch.com/crawl.htm" - } - , - { - "pattern": "seekbot", - "instances": [ - "Seekbot/1.0 (http://www.seekbot.net/bot.html) RobotsTxtFetcher/1.2" - ], - "url": "http://www.seekbot.net/bot.html" - } - , - { - "pattern": "Gigabot", - "instances": [ - "Gigabot/1.0", - "Gigabot/2.0 (http://www.gigablast.com/spider.html)" - ], - "url": "http://www.gigablast.com/spider.html" - } - , - { - "pattern": "Gigablast", - "instances": [ - "GigablastOpenSource/1.0" - ], - "url": "https://github.com/gigablast/open-source-search-engine" - } - , - { - "pattern": "exabot", - "instances": [ - "Mozilla/5.0 (compatible; Alexabot/1.0; +http://www.alexa.com/help/certifyscan; certifyscan@alexa.com)", - "Mozilla/5.0 (compatible; Exabot PyExalead/3.0; +http://www.exabot.com/go/robot)", - "Mozilla/5.0 (compatible; Exabot-Images/3.0; +http://www.exabot.com/go/robot)", - "Mozilla/5.0 (compatible; Exabot/3.0 (BiggerBetter); +http://www.exabot.com/go/robot)", - "Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot)" - ] - } - , - { - "pattern": "ia_archiver", - "instances": [ - "ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)", - "ia_archiver-web.archive.org" - ] - } - , - { - "pattern": "GingerCrawler", - "instances": [ - "GingerCrawler/1.0 (Language Assistant for Dyslexics; www.gingersoftware.com/crawler_agent.htm; support at ginger software dot com)" - ] - } - , - { - "pattern": "webmon ", - "instances": [] - } - , - { - "pattern": "HTTrack", - "instances": [ - "Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)" - ] - } - , - { - "pattern": 
"grub.org", - "instances": [ - "Mozilla/4.0 (compatible; grub-client-0.3.0; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.0.4; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.0.5; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.0.6; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.0.7; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.1.1; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.2.1; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.3.1; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.3.7; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.4.3; Crawl your own stuff with http://grub.org)", - "Mozilla/4.0 (compatible; grub-client-1.5.3; Crawl your own stuff with http://grub.org)" - ] - } - , - { - "pattern": "UsineNouvelleCrawler", - "instances": [] - } - , - { - "pattern": "antibot", - "instances": [] - } - , - { - "pattern": "netresearchserver", - "instances": [] - } - , - { - "pattern": "speedy", - "instances": [ - "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Speedy Spider (http://www.entireweb.com/about/search_tech/speedy_spider/)", - "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Speedy Spider for SpeedyAds (http://www.entireweb.com/about/search_tech/speedy_spider/)", - "Mozilla/5.0 (compatible; Speedy Spider; http://www.entireweb.com/about/search_tech/speedy_spider/)", - "Speedy Spider (Entireweb; Beta/1.2; http://www.entireweb.com/about/search_tech/speedyspider/)", - "Speedy Spider (http://www.entireweb.com/about/search_tech/speedy_spider/)" - ] - } - , - { - "pattern": "fluffy", - "instances": [] - } - , - { - "pattern": "bibnum.bnf", - "instances": [ - "Mozilla/5.0 (compatible; bnf.fr_bot; 
+http://bibnum.bnf.fr/robot/bnf.html)" - ] - } - , - { - "pattern": "findlink", - "instances": [ - "findlinks/1.0 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.3-beta8 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.3-beta9 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.5-beta7 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.6-beta1 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.6-beta1 (+http://wortschatz.uni-leipzig.de/findlinks/; YaCy 0.1; yacy.net)", - "findlinks/1.1.6-beta2 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.6-beta3 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.6-beta4 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.6-beta5 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/1.1.6-beta6 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.0 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.0.1 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.0.2 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.0.4 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.0.5 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.0.9 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.1 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.1.3 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.1.5 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.2 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.5 (+http://wortschatz.uni-leipzig.de/findlinks/)", - "findlinks/2.6 (+http://wortschatz.uni-leipzig.de/findlinks/)" - ] - } - , - { - "pattern": "msrbot", - "instances": [] - } - , - { - "pattern": "panscient", - "instances": [ - "panscient.com" - ] - } - , - { - "pattern": "yacybot", - "instances": [ - "yacybot (-global; amd64 FreeBSD 9.2-RELEASE-p10; java 1.7.0_65; Europe/en) 
http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 2.6.32-042stab111.11; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 2.6.32-042stab116.1; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.10.0-229.4.2.el7.x86_64; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.10.0-229.4.2.el7.x86_64; java 1.8.0_45; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.13.0-61-generic; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.14.32-xxxx-grs-ipv6-64; java 1.8.0_111; Europe/de) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_75; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.19.0-15-generic; java 1.8.0_45-internal; Europe/de) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.2.0-4-amd64; java 1.7.0_65; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.2.0-4-amd64; java 1.7.0_67; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 4.4.0-57-generic; java 9-internal; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Windows 8 6.2; java 1.7.0_55; Europe/de) http://yacy.net/bot.html", - "yacybot (-global; amd64 Windows 8.1 6.3; java 1.7.0_55; Europe/de) http://yacy.net/bot.html", - "yacybot (/global; amd64 FreeBSD 10.3-RELEASE-p7; java 1.7.0_95; GMT/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 FreeBSD 10.3-RELEASE; java 1.8.0_77; GMT/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 2.6.32-042stab093.4; java 1.7.0_65; Etc/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 2.6.32-042stab094.8; java 1.7.0_79; America/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 2.6.32-042stab108.8; java 1.7.0_91; America/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 2.6.32-573.3.1.el6.x86_64; java 1.7.0_85; 
Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.10.0-229.7.2.el7.x86_64; java 1.8.0_45; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.10.0-327.22.2.el7.x86_64; java 1.7.0_101; Etc/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.11.10-21-desktop; java 1.7.0_51; America/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.12.1; java 1.7.0_65; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-042stab093.4; java 1.7.0_79; Europe/de) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-042stab093.4; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-45-generic; java 1.7.0_75; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-74-generic; java 1.7.0_91; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-83-generic; java 1.7.0_95; Europe/de) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-83-generic; java 1.7.0_95; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-85-generic; java 1.7.0_101; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-85-generic; java 1.7.0_95; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.13.0-88-generic; java 1.7.0_101; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.14-0.bpo.1-amd64; java 1.7.0_55; Europe/de) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.14.32-xxxx-grs-ipv6-64; java 1.7.0_75; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16-0.bpo.2-amd64; java 1.7.0_65; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_111; Europe/de) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_75; America/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 
1.7.0_75; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_79; Europe/de) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_91; Europe/de) http://yacy.net/bot.html", - "yacybot (-global; amd64 FreeBSD 9.2-RELEASE-p10; java 1.7.0_65; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 2.6.32-042stab111.11; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 2.6.32-042stab116.1; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (-global; amd64 Linux 3.10.0-229.4.2.el7.x86_64; java 1.7.0_79; Europe/en) http://yacy.net/bot.html", - "yacybot (/global; amd64 Linux 3.16.0-4-amd64; java 1.7.0_95; Europe/en) http://yacy.net/bot.html" - ] - } - , - { - "pattern": "AISearchBot", - "instances": [] - } - , - { - "pattern": "ips-agent", - "instances": [ - "BlackBerry9000/4.6.0.167 Profile/MIDP-2.0 Configuration/CLDC-1.1 VendorID/102 ips-agent", - "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.12; ips-agent) Gecko/20050922 Fedora/1.0.7-1.1.fc4 Firefox/1.0.7", - "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.3; ips-agent) Gecko/20090824 Fedora/1.0.7-1.1.fc4 Firefox/3.5.3", - "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.24; ips-agent) Gecko/20111107 Ubuntu/10.04 (lucid) Firefox/3.6.24", - "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:14.0; ips-agent) Gecko/20100101 Firefox/14.0.1" - ] - } - , - { - "pattern": "tagoobot", - "instances": [] - } - , - { - "pattern": "MJ12bot", - "instances": [ - "MJ12bot/v1.2.0 (http://majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.2.1; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.2.3; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.2.4; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; 
MJ12bot/v1.2.5; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.3.0; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.3.1; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.3.2; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.3.3; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.0; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.1; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.2; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.3; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.4 (domain ownership verifier); http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.4; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.5; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.6; http://mj12bot.com/)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.7; http://mj12bot.com/)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.7; http://www.majestic12.co.uk/bot.php?+)", - "Mozilla/5.0 (compatible; MJ12bot/v1.4.8; http://mj12bot.com/)" - ] - } - , - { - "pattern": "woriobot", - "instances": [ - "Mozilla/5.0 (compatible; woriobot +http://worio.com)", - "Mozilla/5.0 (compatible; woriobot support [at] zite [dot] com +http://zite.com)" - ] - } - , - { - "pattern": "yanga", - "instances": [ - "Yanga WorldSearch Bot v1.1/beta (http://www.yanga.co.uk/)" - ] - } - , - { - "pattern": "buzzbot", - "instances": [ - "Buzzbot/1.0 (Buzzbot; http://www.buzzstream.com; buzzbot@buzzstream.com)" - ] - } - , - { - "pattern": "mlbot", - "instances": [ - "MLBot (www.metadatalabs.com/mlbot)" - ] - } - , - { - "pattern": "YandexBot", - "url": "http://yandex.com/bots", - "instances": [ - "Mozilla/5.0 (compatible; YandexBot/3.0; 
+http://yandex.com/bots)" - ], - "addition_date": "2015/04/14" - } - , - { - "pattern": "yandex.com\\/bots", - "url": "https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.xml#robot-in-logs", - "instances": [ - "Mozilla/5.0 (compatible; YandexWebmaster/2.0; +http://yandex.com/bots)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B411 Safari/600.1.4 (compatible; YandexMobileBot/3.0; +http://yandex.com/bots)" - ], - "addition_date": "2016/12/01" - } - , - { - "pattern": "purebot", - "addition_date": "2010/01/19", - "instances": [] - } - , - { - "pattern": "Linguee Bot", - "addition_date": "2010/01/26", - "url": "http://www.linguee.com/bot", - "instances": [ - "Linguee Bot (http://www.linguee.com/bot)", - "Linguee Bot (http://www.linguee.com/bot; bot@linguee.com)" - ] - } - , - { - "pattern": "CyberPatrol", - "addition_date": "2010/02/11", - "url": "http://www.cyberpatrol.com/cyberpatrolcrawler.asp", - "instances": [ - "CyberPatrol SiteCat Webbot (http://www.cyberpatrol.com/cyberpatrolcrawler.asp)" - ] - } - , - { - "pattern": "voilabot", - "addition_date": "2010/05/18", - "instances": [ - "Mozilla/5.0 (Windows NT 5.1; U; Win64; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)", - "Mozilla/5.0 (Windows; U; Windows NT 5.1; fr; rv:1.8.1) VoilaBot BETA 1.2 (support.voilabot@orange-ftgroup.com)", - "Mozilla/5.0 (compatible; OrangeBot/2.0; support.voilabot@orange.com)" - ] - } - , - { - "pattern": "Baiduspider", - "addition_date": "2010/07/15", - "url": "http://www.baidu.jp/spider/", - "instances": [ - "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - ] - } - , - { - "pattern": "citeseerxbot", - "addition_date": "2010/07/17", - "instances": [] - } - , - { - "pattern": "spbot", - "addition_date": "2010/07/31", - "url": "http://www.seoprofiler.com/bot", - "instances": [ - "Mozilla/5.0 (compatible; spbot/1.0; 
+http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/1.1; +http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/1.2; +http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/2.0.1; +http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/2.0.2; +http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/2.0.3; +http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/2.0.4; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/2.0; +http://www.seoprofiler.com/bot/ )", - "Mozilla/5.0 (compatible; spbot/2.1; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/3.0; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/3.1; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.1; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.2; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.3; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.4; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.5; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.6; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.7; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.7; +https://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.8; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.0.9; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.0; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0a; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.0b; +http://www.seoprofiler.com/bot )", - "Mozilla/5.0 (compatible; spbot/4.1.0; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.2.0; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.3.0; 
+http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.4.0; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.4.1; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/4.4.2; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/5.0.1; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/5.0.2; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/5.0.3; +http://OpenLinkProfiler.org/bot )", - "Mozilla/5.0 (compatible; spbot/5.0; +http://OpenLinkProfiler.org/bot )" - ] - } - , - { - "pattern": "twengabot", - "addition_date": "2010/08/03", - "url": "http://www.twenga.com/bot.html", - "instances": [] - } - , - { - "pattern": "postrank", - "addition_date": "2010/08/03", - "url": "http://www.postrank.com", - "instances": [ - "PostRank/2.0 (postrank.com)", - "PostRank/2.0 (postrank.com; 1 subscribers)" - ] - } - , - { - "pattern": "turnitinbot", - "addition_date": "2010/09/26", - "url": "http://www.turnitin.com", - "instances": [] - } - , - { - "pattern": "scribdbot", - "addition_date": "2010/09/28", - "url": "http://www.scribd.com", - "instances": [] - } - , - { - "pattern": "page2rss", - "addition_date": "2010/10/07", - "url": "http://www.page2rss.com", - "instances": [ - "Mozilla/5.0 (compatible; Page2RSS/0.7; +http://page2rss.com/)" - ] - } - , - { - "pattern": "sitebot", - "addition_date": "2010/12/15", - "url": "http://www.sitebot.org", - "instances": [ - "Mozilla/5.0 (compatible; Whoiswebsitebot/0.1; +http://www.whoiswebsite.net)" - ] - } - , - { - "pattern": "linkdex", - "addition_date": "2011/01/06", - "url": "http://www.linkdex.com", - "instances": [ - "Mozilla/5.0 (compatible; linkdexbot/2.0; +http://www.linkdex.com/about/bots/)", - "Mozilla/5.0 (compatible; linkdexbot/2.0; +http://www.linkdex.com/bots/)", - "Mozilla/5.0 (compatible; linkdexbot/2.1; +http://www.linkdex.com/about/bots/)", - "Mozilla/5.0 (compatible; linkdexbot/2.1; 
+http://www.linkdex.com/bots/)", - "Mozilla/5.0 (compatible; linkdexbot/2.2; +http://www.linkdex.com/bots/)", - "linkdex.com/v2.0", - "linkdexbot/Nutch-1.0-dev (http://www.linkdex.com/; crawl at linkdex dot com)" - ] - } - , - { - "pattern": "Adidxbot", - "url": "http://onlinehelp.microsoft.com/en-us/bing/hh204496.aspx", - "instances": [] - } - , - { - "pattern": "blekkobot", - "url": "http://blekko.com/about/blekkobot", - "instances": [ - "Mozilla/5.0 (compatible; Blekkobot; ScoutJet; +http://blekko.com/about/blekkobot)" - ] - } - , - { - "pattern": "ezooms", - "addition_date": "2011/04/27", - "url": "http://www.phpbb.com/community/viewtopic.php?f=64&t=935605&start=450#p12948289", - "instances": [ - "Mozilla/5.0 (compatible; Ezooms/1.0; ezooms.bot@gmail.com)" - ] - } - , - { - "pattern": "dotbot", - "addition_date": "2011/04/27", - "instances": [ - "Mozilla/5.0 (compatible; DotBot/1.1; http://www.opensiteexplorer.org/dotbot, help@moz.com)", - "dotbot" - ] - } - , - { - "pattern": "Mail.RU_Bot", - "addition_date": "2011/04/27", - "instances": [ - "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/", - "Mozilla/5.0 (compatible; Mail.RU_Bot/2.0; +http://go.mail.ru/" - ] - } - , - { - "pattern": "discobot", - "addition_date": "2011/05/03", - "url": "http://discoveryengine.com/discobot.html", - "instances": [ - "Mozilla/5.0 (compatible; discobot/1.0; +http://discoveryengine.com/discobot.html)", - "Mozilla/5.0 (compatible; discobot/2.0; +http://discoveryengine.com/discobot.html)", - "mozilla/5.0 (compatible; discobot/1.1; +http://discoveryengine.com/discobot.html)" - ] - } - , - { - "pattern": "heritrix", - "addition_date": "2011/06/21", - "url": "http://crawler.archive.org/", - "instances": [ - "Mozilla/5.0 (compatible; archive.org_bot/heritrix-1.15.4 +http://www.archive.org)", - "Mozilla/5.0 (compatible; heritrix/1.12.1 +http://www.webarchiv.cz)", - "Mozilla/5.0 (compatible; heritrix/1.12.1b +http://netarkivet.dk/website/info.html)", - 
"Mozilla/5.0 (compatible; heritrix/1.14.2 +http://rjpower.org)", - "Mozilla/5.0 (compatible; heritrix/1.14.2 +http://www.webarchiv.cz)", - "Mozilla/5.0 (compatible; heritrix/1.14.3 +http://archive.org)", - "Mozilla/5.0 (compatible; heritrix/1.14.3 +http://www.accelobot.com)", - "Mozilla/5.0 (compatible; heritrix/1.14.3 +http://www.webarchiv.cz)", - "Mozilla/5.0 (compatible; heritrix/1.14.3.r6601 +http://www.buddybuzz.net/yptrino)", - "Mozilla/5.0 (compatible; heritrix/1.14.4 +http://parsijoo.ir)", - "Mozilla/5.0 (compatible; heritrix/1.14.4 +http://www.exif-search.com)", - "Mozilla/5.0 (compatible; heritrix/2.0.2 +http://aihit.com)", - "Mozilla/5.0 (compatible; heritrix/2.0.2 +http://seekda.com)", - "Mozilla/5.0 (compatible; heritrix/3.0.0-SNAPSHOT-20091120.021634 +http://crawler.archive.org)", - "Mozilla/5.0 (compatible; heritrix/3.1.0-RC1 +http://boston.lti.cs.cmu.edu/crawler_12/)", - "Mozilla/5.0 (compatible; heritrix/3.1.1 +http://places.tomtom.com/crawlerinfo)", - "Mozilla/5.0 (compatible; heritrix/3.1.1 +http://www.mixdata.com)", - "Mozilla/5.0 (compatible; heritrix/3.1.1-SNAPSHOT-20120116.200628 +http://www.archive.org/details/archive.org_bot)", - "Mozilla/5.0 (compatible; heritrix/3.1.1; UniLeipzigASV +http://corpora.informatik.uni-leipzig.de/crawler_faq.html)", - "Mozilla/5.0 (compatible; heritrix/3.2.0 +http://www.crim.ca)", - "Mozilla/5.0 (compatible; heritrix/3.2.0 +http://www.exif-search.com)", - "Mozilla/5.0 (compatible; heritrix/3.2.0 +http://www.mixdata.com)", - "Mozilla/5.0 (compatible; heritrix/3.3.0-SNAPSHOT-20140702-2247 +http://archive.org/details/archive.org_bot)", - "Mozilla/5.0 (compatible; heritrix/3.3.0-SNAPSHOT-20160309-0050; UniLeipzigASV +http://corpora.informatik.uni-leipzig.de/crawler_faq.html)", - "Mozilla/5.0 (compatible; sukibot_heritrix/3.1.1 +http://suki.ling.helsinki.fi/eng/webmasters.html)" - ] - } - , - { - "pattern": "findthatfile", - "addition_date": "2011/06/21", - "url": "http://www.findthatfile.com/", - "instances": [] - 
} - , - { - "pattern": "europarchive.org", - "addition_date": "2011/06/21", - "url": "", - "instances": [ - "Mozilla/5.0 (compatible; MSIE 7.0 +http://www.europarchive.org)" - ] - } - , - { - "pattern": "NerdByNature.Bot", - "addition_date": "2011/07/12", - "url": "http://www.nerdbynature.net/bot", - "instances": [ - "Mozilla/5.0 (compatible; NerdByNature.Bot; http://www.nerdbynature.net/bot)" - ] - } - , - { - "pattern": "sistrix crawler", - "addition_date": "2011/08/02", - "instances": [] - } - , - { - "pattern": "Ahrefs(Bot|SiteAudit)", - "addition_date": "2011/08/28", - "instances": [ - "Mozilla/5.0 (compatible; AhrefsBot/5.2; News; +http://ahrefs.com/robot/)", - "Mozilla/5.0 (compatible; AhrefsSiteAudit/5.2; +http://ahrefs.com/robot/)" - ] - } - , - { - "pattern": "fuelbot", - "addition_date": "2018/06/28", - "instances": [ - "fuelbot" - ] - } - , - { - "pattern": "CrunchBot", - "addition_date": "2018/06/28", - "instances": [ - "CrunchBot/1.0 (+http://www.leadcrunch.com/crunchbot)" - ] - } - , - { - "pattern": "centurybot9", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; Go-http-client/1.1; +centurybot9@gmail.com)" - ] - } - , - { - "pattern": "IndeedBot", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (Windows NT 6.1; rv:38.0) Gecko/20100101 Firefox/38.0 (IndeedBot 1.1)" - ] - } - , - { - "pattern": "mappydata", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; Mappy/1.0; +http://mappydata.net/bot/)" - ] - } - , - { - "pattern": "woobot", - "addition_date": "2018/06/28", - "instances": [ - "woobot" - ] - } - , - { - "pattern": "ZoominfoBot", - "addition_date": "2018/06/28", - "instances": [ - "ZoominfoBot (zoominfobot at zoominfo dot com)" - ] - } - , - { - "pattern": "PrivacyAwareBot", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; PrivacyAwareBot/1.1; +http://www.privacyaware.org)" - ] - } - , - { - "pattern": "Multiviewbot", - "addition_date": 
"2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Multiviewbot" - ] - } - , - { - "pattern": "SWIMGBot", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 SWIMGBot" - ] - } - , - { - "pattern": "Grobbot", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; Grobbot/2.2; +https://grob.it)" - ] - } - , - { - "pattern": "eright", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; eright/1.0; +bot@eright.com)" - ] - } - , - { - "pattern": "Apercite", - "addition_date": "2018/06/28", - "instances": [ - "Mozilla/5.0 (compatible; Apercite; +http://www.apercite.fr/robot/index.html)" - ] - } - , - { - "pattern": "semanticbot", - "addition_date": "2018/06/28", - "instances": [ - "semanticbot", - "semanticbot (info@semanticaudience.com)" - ] - } - , - { - "pattern": "Aboundex", - "addition_date": "2011/09/28", - "url": "http://www.aboundex.com/crawler/", - "instances": [ - "Aboundex/0.2 (http://www.aboundex.com/crawler/)", - "Aboundex/0.3 (http://www.aboundex.com/crawler/)" - ] - } - , - { - "pattern": "domaincrawler", - "addition_date": "2011/10/21", - "instances": [ - "CipaCrawler/3.0 (info@domaincrawler.com; http://www.domaincrawler.com/www.example.com)" - ] - } - , - { - "pattern": "wbsearchbot", - "addition_date": "2011/12/21", - "url": "http://www.warebay.com/bot.html", - "instances": [] - } - , - { - "pattern": "summify", - "addition_date": "2012/01/04", - "url": "http://summify.com", - "instances": [ - "Summify (Summify/1.0.1; +http://summify.com)" - ] - } - , - { - "pattern": "CCBot", - "addition_date": "2012/02/05", - "url": "http://www.commoncrawl.org/bot.html", - "instances": [ - "CCBot/2.0 (http://commoncrawl.org/faq/)" - ] - } - , - { - "pattern": "edisterbot", - "addition_date": "2012/02/25", - "instances": [] - } - , - { - "pattern": "seznambot", - 
"addition_date": "2012/03/14", - "instances": [ - "Mozilla/5.0 (compatible; SeznamBot/3.2-test1-1; +http://napoveda.seznam.cz/en/seznambot-intro/)", - "Mozilla/5.0 (compatible; SeznamBot/3.2-test1; +http://napoveda.seznam.cz/en/seznambot-intro/)", - "Mozilla/5.0 (compatible; SeznamBot/3.2-test2; +http://napoveda.seznam.cz/en/seznambot-intro/)", - "Mozilla/5.0 (compatible; SeznamBot/3.2-test4; +http://napoveda.seznam.cz/en/seznambot-intro/)", - "Mozilla/5.0 (compatible; SeznamBot/3.2; +http://napoveda.seznam.cz/en/seznambot-intro/)" - ] - } - , - { - "pattern": "ec2linkfinder", - "addition_date": "2012/03/22", - "instances": [ - "ec2linkfinder" - ] - } - , - { - "pattern": "gslfbot", - "addition_date": "2012/04/03", - "instances": [] - } - , - { - "pattern": "aiHitBot", - "addition_date": "2012/04/16", - "instances": [ - "Mozilla/5.0 (compatible; aiHitBot/2.9; +https://www.aihitdata.com/about)" - ] - } - , - { - "pattern": "intelium_bot", - "addition_date": "2012/05/07", - "instances": [] - } - , - { - "pattern": "facebookexternalhit", - "addition_date": "2012/05/07", - "instances": [ - "facebookexternalhit/1.0 (+http://www.facebook.com/externalhit_uatext.php)", - "facebookexternalhit/1.1", - "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)" - ] - } - , - { - "pattern": "Yeti", - "addition_date": "2012/05/07", - "url": "http://naver.me/bot", - "instances": [ - "Mozilla/5.0 (compatible; Yeti/1.1; +http://naver.me/bot)" - ] - } - , - { - "pattern": "RetrevoPageAnalyzer", - "addition_date": "2012/05/07", - "instances": [ - "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; RetrevoPageAnalyzer; +http://www.retrevo.com/content/about-us)" - ] - } - , - { - "pattern": "lb-spider", - "addition_date": "2012/05/07", - "instances": [] - } - , - { - "pattern": "Sogou", - "addition_date": "2012/05/13", - "url": "http://www.sogou.com/docs/help/webmasters.htm#07", - "instances": [ - "Sogou News 
Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)", - "Sogou Pic Spider/3.0(+http://www.sogou.com/docs/help/webmasters.htm#07)", - "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)" - ] - } - , - { - "pattern": "lssbot", - "addition_date": "2012/05/15", - "instances": [] - } - , - { - "pattern": "careerbot", - "addition_date": "2012/05/23", - "url": "http://www.career-x.de/bot.html", - "instances": [] - } - , - { - "pattern": "wotbox", - "addition_date": "2012/06/12", - "url": "http://www.wotbox.com", - "instances": [ - "Wotbox/2.0 (bot@wotbox.com; http://www.wotbox.com)", - "Wotbox/2.01 (+http://www.wotbox.com/bot/)" - ] - } - , - { - "pattern": "wocbot", - "addition_date": "2012/07/25", - "url": "http://www.wocodi.com/crawler", - "instances": [] - } - , - { - "pattern": "ichiro", - "addition_date": "2012/08/28", - "url": "http://help.goo.ne.jp/help/article/1142", - "instances": [ - "DoCoMo/2.0 P900i(c100;TB;W24H11) (compatible; ichiro/mobile goo; +http://help.goo.ne.jp/help/article/1142/)", - "DoCoMo/2.0 P900i(c100;TB;W24H11) (compatible; ichiro/mobile goo; +http://search.goo.ne.jp/option/use/sub4/sub4-1/)", - "DoCoMo/2.0 P900i(c100;TB;W24H11) (compatible; ichiro/mobile goo;+http://search.goo.ne.jp/option/use/sub4/sub4-1/)", - "DoCoMo/2.0 P900i(c100;TB;W24H11)(compatible; ichiro/mobile goo;+http://help.goo.ne.jp/door/crawler.html)", - "DoCoMo/2.0 P901i(c100;TB;W24H11) (compatible; ichiro/mobile goo; +http://help.goo.ne.jp/door/crawler.html)", - "KDDI-CA31 UP.Browser/6.2.0.7.3.129 (GUI) MMP/2.0 (compatible; ichiro/mobile goo; +http://help.goo.ne.jp/help/article/1142/)", - "KDDI-CA31 UP.Browser/6.2.0.7.3.129 (GUI) MMP/2.0 (compatible; ichiro/mobile goo; +http://search.goo.ne.jp/option/use/sub4/sub4-1/)", - "KDDI-CA31 UP.Browser/6.2.0.7.3.129 (GUI) MMP/2.0 (compatible; ichiro/mobile goo;+http://search.goo.ne.jp/option/use/sub4/sub4-1/)", - "ichiro/2.0 (http://help.goo.ne.jp/door/crawler.html)", - "ichiro/2.0 (ichiro@nttr.co.jp)", - 
"ichiro/3.0 (http://help.goo.ne.jp/door/crawler.html)", - "ichiro/3.0 (http://help.goo.ne.jp/help/article/1142)", - "ichiro/3.0 (http://search.goo.ne.jp/option/use/sub4/sub4-1/)", - "ichiro/4.0 (http://help.goo.ne.jp/door/crawler.html)", - "ichiro/5.0 (http://help.goo.ne.jp/door/crawler.html)" - ] - } - , - { - "pattern": "DuckDuckBot", - "addition_date": "2012/09/19", - "url": "http://duckduckgo.com/duckduckbot.html", - "instances": [ - "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)", - "DuckDuckBot/1.1; (+http://duckduckgo.com/duckduckbot.html)" - ] - } - , - { - "pattern": "lssrocketcrawler", - "addition_date": "2012/09/24", - "instances": [] - } - , - { - "pattern": "drupact", - "addition_date": "2012/09/27", - "url": "http://www.arocom.de/drupact", - "instances": [ - "drupact/0.7; http://www.arocom.de/drupact" - ] - } - , - { - "pattern": "webcompanycrawler", - "addition_date": "2012/10/03", - "instances": [] - } - , - { - "pattern": "acoonbot", - "addition_date": "2012/10/07", - "url": "http://www.acoon.de/robot.asp", - "instances": [] - } - , - { - "pattern": "openindexspider", - "addition_date": "2012/10/26", - "url": "http://www.openindex.io/en/webmasters/spider.html", - "instances": [] - } - , - { - "pattern": "gnam gnam spider", - "addition_date": "2012/10/31", - "instances": [] - } - , - { - "pattern": "web-archive-net.com.bot", - "instances": [] - } - , - { - "pattern": "backlinkcrawler", - "addition_date": "2013/01/04", - "instances": [] - } - , - { - "pattern": "coccoc", - "addition_date": "2013/01/04", - "url": "http://help.coccoc.vn/", - "instances": [ - "Mozilla/5.0 (compatible; coccoc/1.0; +http://help.coccoc.com/)", - "Mozilla/5.0 (compatible; coccoc/1.0; +http://help.coccoc.com/searchengine)", - "Mozilla/5.0 (compatible; coccocbot-image/1.0; +http://help.coccoc.com/searchengine)", - "Mozilla/5.0 (compatible; coccocbot-web/1.0; +http://help.coccoc.com/searchengine)", - "Mozilla/5.0 (compatible; image.coccoc/1.0; 
+http://help.coccoc.com/)", - "Mozilla/5.0 (compatible; imagecoccoc/1.0; +http://help.coccoc.com/)", - "Mozilla/5.0 (compatible; imagecoccoc/1.0; +http://help.coccoc.com/searchengine)", - "coccoc", - "coccoc/1.0 ()", - "coccoc/1.0 (http://help.coccoc.com/)", - "coccoc/1.0 (http://help.coccoc.vn/)" - ] - } - , - { - "pattern": "integromedb", - "addition_date": "2013/01/10", - "url": "http://www.integromedb.org/Crawler", - "instances": [ - "www.integromedb.org/Crawler" - ] - } - , - { - "pattern": "content crawler spider", - "addition_date": "2013/01/11", - "instances": [] - } - , - { - "pattern": "toplistbot", - "addition_date": "2013/02/05", - "instances": [] - } - , - { - "pattern": "it2media-domain-crawler", - "addition_date": "2013/03/12", - "instances": [ - "it2media-domain-crawler/1.0 on crawler-prod.it2media.de", - "it2media-domain-crawler/2.0" - ] - } - , - { - "pattern": "ip-web-crawler.com", - "addition_date": "2013/03/22", - "instances": [] - } - , - { - "pattern": "siteexplorer.info", - "addition_date": "2013/05/01", - "instances": [ - "Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)", - "Mozilla/5.0 (compatible; SiteExplorer/1.1b; +http://siteexplorer.info/Backlink-Checker-Spider/)" - ] - } - , - { - "pattern": "elisabot", - "addition_date": "2013/06/27", - "instances": [] - } - , - { - "pattern": "proximic", - "addition_date": "2013/09/12", - "url": "http://www.proximic.com/info/spider.php", - "instances": [ - "Mozilla/5.0 (compatible; proximic; +http://www.proximic.com)", - "Mozilla/5.0 (compatible; proximic; +http://www.proximic.com/info/spider.php)" - ] - } - , - { - "pattern": "changedetection", - "addition_date": "2013/09/13", - "url": "http://www.changedetection.com/bot.html", - "instances": [ - "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; http://www.changedetection.com/bot.html )" - ] - } - , - { - "pattern": "arabot", - "addition_date": "2013/10/09", - "instances": [] - } - , - { - "pattern": "WeSEE:Search", 
- "addition_date": "2013/11/18", - "instances": [ - "WeSEE:Search", - "WeSEE:Search/0.1 (Alpha, http://www.wesee.com/en/support/bot/)" - ] - } - , - { - "pattern": "niki-bot", - "addition_date": "2014/01/01", - "instances": [] - } - , - { - "pattern": "CrystalSemanticsBot", - "addition_date": "2014/02/17", - "url": "http://www.crystalsemantics.com/user-agent/", - "instances": [] - } - , - { - "pattern": "rogerbot", - "addition_date": "2014/02/28", - "url": "http://moz.com/help/pro/what-is-rogerbot-", - "instances": [ - "Mozilla/5.0 (compatible; rogerBot/1.0; UrlCrawler; http://www.seomoz.org/dp/rogerbot)", - "rogerbot/1.0 (http://moz.com/help/pro/what-is-rogerbot-, rogerbot-crawler+partager@moz.com)", - "rogerbot/1.0 (http://moz.com/help/pro/what-is-rogerbot-, rogerbot-crawler+shiny@moz.com)", - "rogerbot/1.0 (http://moz.com/help/pro/what-is-rogerbot-, rogerbot-wherecat@moz.com", - "rogerbot/1.0 (http://moz.com/help/pro/what-is-rogerbot-, rogerbot-wherecat@moz.com)", - "rogerbot/1.0 (http://www.moz.com/dp/rogerbot, rogerbot-crawler@moz.com)", - "rogerbot/1.0 (http://www.seomoz.org/dp/rogerbot, rogerbot-crawler+shiny@seomoz.org)", - "rogerbot/1.0 (http://www.seomoz.org/dp/rogerbot, rogerbot-crawler@seomoz.org)", - "rogerbot/1.0 (http://www.seomoz.org/dp/rogerbot, rogerbot-wherecat@moz.com)", - "rogerbot/1.1 (http://moz.com/help/guides/search-overview/crawl-diagnostics#more-help, rogerbot-crawler+pr2-crawler-05@moz.com)", - "rogerbot/1.1 (http://moz.com/help/guides/search-overview/crawl-diagnostics#more-help, rogerbot-crawler+pr4-crawler-11@moz.com)", - "rogerbot/1.1 (http://moz.com/help/guides/search-overview/crawl-diagnostics#more-help, rogerbot-crawler+pr4-crawler-15@moz.com)", - "rogerbot/1.2 (http://moz.com/help/pro/what-is-rogerbot-, rogerbot-crawler+phaser-testing-crawler-01@moz.com)" - ] - } - , - { - "pattern": "360Spider", - "addition_date": "2014/03/14", - "url": "http://needs-be.blogspot.co.uk/2013/02/how-to-block-spider360.html", - "instances": [ - 
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1; 360Spider", - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1; 360Spider(compatible; HaosouSpider; http://www.haosou.com/help/help_3_2.html)", - "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 QIHU 360SE; 360Spider", - "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; ) Firefox/1.5.0.11; 360Spider", - "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.0.11) Firefox/1.5.0.11; 360Spider", - "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.0.11) Firefox/1.5.0.11 360Spider;", - "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.0.11) Gecko/20070312 Firefox/1.5.0.11; 360Spider", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0); 360Spider", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0); 360Spider(compatible; HaosouSpider; http://www.haosou.com/help/help_3_2.html)" - ] - } - , - { - "pattern": "psbot", - "addition_date": "2014/03/31", - "url": "http://www.picsearch.com/bot.html", - "instances": [ - "psbot-image (+http://www.picsearch.com/bot.html)", - "psbot-page (+http://www.picsearch.com/bot.html)", - "psbot/0.1 (+http://www.picsearch.com/bot.html)" - ] - } - , - { - "pattern": "InterfaxScanBot", - "addition_date": "2014/03/31", - "url": "http://scan-interfax.ru", - "instances": [] - } - , - { - "pattern": "CC Metadata Scaper", - "addition_date": "2014/04/01", - "url": "http://wiki.creativecommons.org/Metadata_Scraper", - "instances": [ - "CC Metadata Scaper http://wiki.creativecommons.org/Metadata_Scraper" - ] - } - , - { - "pattern": "g00g1e.net", - "addition_date": "2014/04/01", - "url": "http://www.g00g1e.net/", - "instances": [] - } - , - { - "pattern": "GrapeshotCrawler", - "addition_date": "2014/04/01", - "url": "http://www.grapeshot.co.uk/crawler.php", - "instances": [ - "Mozilla/5.0 (compatible; 
GrapeshotCrawler/2.0; +http://www.grapeshot.co.uk/crawler.php)" - ] - } - , - { - "pattern": "urlappendbot", - "addition_date": "2014/05/10", - "url": "http://www.profound.net/urlappendbot.html", - "instances": [ - "Mozilla/5.0 (compatible; URLAppendBot/1.0; +http://www.profound.net/urlappendbot.html)" - ] - } - , - { - "pattern": "brainobot", - "addition_date": "2014/06/24", - "instances": [] - } - , - { - "pattern": "fr-crawler", - "addition_date": "2014/07/31", - "instances": [ - "Mozilla/5.0 (compatible; fr-crawler/1.1)" - ] - } - , - { - "pattern": "binlar", - "addition_date": "2014/09/12", - "instances": [ - "binlar_2.6.3 binlar2.6.3@unspecified.mail", - "binlar_2.6.3 binlar_2.6.3@unspecified.mail", - "binlar_2.6.3 larbin2.6.3@unspecified.mail", - "binlar_2.6.3 phanendra_kalapala@McAfee.com", - "binlar_2.6.3 test@mgmt.mic" - ] - } - , - { - "pattern": "SimpleCrawler", - "addition_date": "2014/09/12", - "instances": [ - "SimpleCrawler/0.1" - ] - } - , - { - "pattern": "Twitterbot", - "addition_date": "2014/09/12", - "url": "https://dev.twitter.com/cards/getting-started", - "instances": [ - "Twitterbot/0.1", - "Twitterbot/1.0" - ] - } - , - { - "pattern": "cXensebot", - "addition_date": "2014/10/05", - "instances": [ - "cXensebot/1.1a" - ], - "url": "http://www.cxense.com/bot.html" - } - , - { - "pattern": "smtbot", - "addition_date": "2014/10/04", - "instances": [ - "Mozilla/5.0 (compatible; SMTBot/1.0; +http://www.similartech.com/smtbot)", - "SMTBot (similartech.com/smtbot)" - ], - "url": "http://www.similartech.com/smtbot" - } - , - { - "pattern": "bnf.fr_bot", - "addition_date": "2014/11/18", - "url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html", - "instances": [ - "Mozilla/5.0 (compatible; bnf.fr_bot; +http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html)" - ] - } - , - { - "pattern": "A6-Indexer", - "addition_date": "2014/12/05", - "url": "http://www.a6corp.com/a6-web-scraping-policy/", - "instances": [ - "A6-Indexer" - ] - } - , - { - 
"pattern": "ADmantX", - "addition_date": "2014/12/05", - "url": "http://www.admantx.com", - "instances": [ - "ADmantX Platform Semantic Analyzer - ADmantX Inc. - www.admantx.com - support@admantx.com" - ] - } - , - { - "pattern": "Facebot", - "url": "https://developers.facebook.com/docs/sharing/best-practices#crawl", - "addition_date": "2014/12/30", - "instances": [ - "Facebot/1.0" - ] - } - , - { - "pattern": "OrangeBot\\/", - "instances": [ - "Mozilla/5.0 (compatible; OrangeBot/2.0; support.orangebot@orange.com" - ], - "addition_date": "2015/01/12" - } - , - { - "pattern": "memorybot", - "url": "http://mignify.com/bot.htm", - "instances": [ - "Mozilla/5.0 (compatible; memorybot/1.21.14 +http://mignify.com/bot.html)" - ], - "addition_date": "2015/02/01" - } - , - { - "pattern": "AdvBot", - "url": "http://advbot.net/bot.html", - "instances": [ - "Mozilla/5.0 (compatible; AdvBot/2.0; +http://advbot.net/bot.html)" - ], - "addition_date": "2015/02/01" - } - , - { - "pattern": "MegaIndex", - "url": "https://www.megaindex.ru/?tab=linkAnalyze", - "instances": [ - "Mozilla/5.0 (compatible; MegaIndex.ru/2.0; +https://www.megaindex.ru/?tab=linkAnalyze)" - ], - "addition_date": "2015/03/28" - } - , - { - "pattern": "SemanticScholarBot", - "url": "http://s2.allenai.org/bot.html", - "instances": [ - "SemanticScholarBot/1.0 (+http://s2.allenai.org/bot.html)" - ], - "addition_date": "2015/03/28" - } - , - { - "pattern": "ltx71", - "url": "http://ltx71.com/", - "instances": [ - "ltx71 - (http://ltx71.com/)" - ], - "addition_date": "2015/04/04" - } - , - { - "pattern": "nerdybot", - "url": "http://nerdybot.com/", - "instances": [ - "nerdybot" - ], - "addition_date": "2015/04/05" - } - , - { - "pattern": "xovibot", - "url": "http://www.xovibot.net/", - "instances": [ - "Mozilla/5.0 (compatible; XoviBot/2.0; +http://www.xovibot.net/)" - ], - "addition_date": "2015/04/05" - } - , - { - "pattern": "BUbiNG", - "url": "http://law.di.unimi.it/BUbiNG.html", - "instances": [ - "BUbiNG 
(+http://law.di.unimi.it/BUbiNG.html)" - ], - "addition_date": "2015/04/06" - } - , - { - "pattern": "Qwantify", - "url": "https://www.qwant.com/", - "instances": [ - "Mozilla/5.0 (compatible; Qwantify/2.0n; +https://www.qwant.com/)/*" - ], - "addition_date": "2015/04/06" - } - , - { - "pattern": "archive.org_bot", - "url": "http://www.archive.org/details/archive.org_bot", - "instances": [ - "Mozilla/5.0 (compatible; archive.org_bot +http://www.archive.org/details/archive.org_bot)" - ], - "addition_date": "2015/04/14" - } - , - { - "pattern": "Applebot", - "url": "http://www.apple.com/go/applebot", - "addition_date": "2015/04/15", - "instances": [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1)", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML, like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1; +http://www.apple.com/go/applebot)", - "Mozilla/5.0 (compatible; Applebot/0.3; +http://www.apple.com/go/applebot)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Applebot/0.3; +http://www.apple.com/go/applebot)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12B410 Safari/600.1.4 (Applebot/0.1; +http://www.apple.com/go/applebot)" - ] - } - , - { - "pattern": "TweetmemeBot", - "url": "http://datasift.com/bot.html", - "instances": [ - "Mozilla/5.0 (TweetmemeBot/4.0; +http://datasift.com/bot.html) Gecko/20100101 Firefox/31.0" - ], - "addition_date": "2015/04/15" - } - , - { - "pattern": "crawler4j", - "url": "https://github.com/yasserg/crawler4j", - "instances": [ - "crawler4j (http://code.google.com/p/crawler4j/)" - ], - "addition_date": "2015/05/07" - } - , - { - "pattern": "findxbot", - "url": "http://www.findxbot.com", - "instances": [ - "Mozilla/5.0 (compatible; Findxbot/1.0; 
+http://www.findxbot.com)" - ], - "addition_date": "2015/05/07" - } - , - { - "pattern": "S[eE][mM]rushBot", - "url": "http://www.semrush.com/bot.html", - "instances": [ - "Mozilla/5.0 (compatible; SemrushBot/0.98~bl; +http://www.semrush.com/bot.html)", - "SEMrushBot" - ], - "addition_date": "2015/05/26" - } - , - { - "pattern": "yoozBot", - "url": "http://yooz.ir", - "instances": [ - "Mozilla/5.0 (compatible; yoozBot-2.2; http://yooz.ir; info@yooz.ir)" - ], - "addition_date": "2015/05/26" - } - , - { - "pattern": "lipperhey", - "url": "http://www.lipperhey.com/", - "instances": [ - "Mozilla/5.0 (compatible; Lipperhey Link Explorer; http://www.lipperhey.com/)", - "Mozilla/5.0 (compatible; Lipperhey SEO Service; http://www.lipperhey.com/)", - "Mozilla/5.0 (compatible; Lipperhey Site Explorer; http://www.lipperhey.com/)", - "Mozilla/5.0 (compatible; Lipperhey-Kaus-Australis/5.0; +https://www.lipperhey.com/en/about/)" - ], - "addition_date": "2015/08/26" - } - , - { - "pattern": "Y!J", - "url": "https://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/~/%E3%82%A6%E3%82%A7%E3%83%96%E3%83%9A%E3%83%BC%E3%82%B8%E3%81%AB%E3%82%A2%E3%82%AF%E3%82%BB%E3%82%B9%E3%81%99%E3%82%8B%E3%82%B7%E3%82%B9%E3%83%86%E3%83%A0%E3%81%AE%E3%83%A6%E3%83%BC%E3%82%B6%E3%83%BC%E3%82%A8%E3%83%BC%E3%82%B8%E3%82%A7%E3%83%B3%E3%83%88%E3%81%AB%E3%81%A4%E3%81%84%E3%81%A6", - "instances": [ - "Y!J-ASR/0.1 crawler (http://www.yahoo-help.jp/app/answers/detail/p/595/a_id/42716/)", - "Y!J-BRJ/YATS crawler (http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html)", - "Y!J-PSC/1.0 crawler (http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html)", - "Y!J-BRW/1.0 crawler (http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html)", - "Mozilla/5.0 (iPhone; Y!J-BRY/YATSH crawler; http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html)", - "Mozilla/5.0 (compatible; Y!J SearchMonkey/1.0 (Y!J-AGENT; http://help.yahoo.co.jp/help/jp/search/indexing/indexing-15.html))" - ], - 
"addition_date": "2015/05/26" - } - , - { - "pattern": "Domain Re-Animator Bot", - "url": "http://domainreanimator.com", - "instances": [ - "Domain Re-Animator Bot (http://domainreanimator.com) - support@domainreanimator.com" - ], - "addition_date": "2015/04/14" - } - , - { - "pattern": "AddThis", - "url": "https://www.addthis.com", - "instances": [ - "AddThis.com robot tech.support@clearspring.com" - ], - "addition_date": "2015/06/02" - } - , - { - "pattern": "Screaming Frog SEO Spider", - "url": "http://www.screamingfrog.co.uk/seo-spider", - "instances": [ - "Screaming Frog SEO Spider/5.1" - ], - "addition_date": "2016/01/08" - } - , - { - "pattern": "MetaURI", - "url": "http://www.useragentstring.com/MetaURI_id_17683.php", - "instances": [ - "MetaURI API/2.0 +metauri.com" - ], - "addition_date": "2016/01/02" - } - , - { - "pattern": "Scrapy", - "url": "http://scrapy.org/", - "instances": [ - "Scrapy/1.0.3 (+http://scrapy.org)" - ], - "addition_date": "2016/01/02" - } - , - { - "pattern": "Livelap[bB]ot", - "url": "http://site.livelap.com/crawler", - "instances": [ - "LivelapBot/0.2 (http://site.livelap.com/crawler)", - "Livelapbot/0.1" - ], - "addition_date": "2016/01/02" - } - , - { - "pattern": "OpenHoseBot", - "url": "http://www.openhose.org/bot.html", - "instances": [ - "Mozilla/5.0 (compatible; OpenHoseBot/2.1; +http://www.openhose.org/bot.html)" - ], - "addition_date": "2016/01/02" - } - , - { - "pattern": "CapsuleChecker", - "url": "http://www.capsulink.com/about", - "instances": [ - "CapsuleChecker (http://www.capsulink.com/)" - ], - "addition_date": "2016/01/02" - } - , - { - "pattern": "collection@infegy.com", - "url": "http://infegy.com/", - "instances": [ - "Mozilla/5.0 (compatible) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 collection@infegy.com" - ], - "addition_date": "2016/01/03" - } - , - { - "pattern": "IstellaBot", - "url": "http://www.tiscali.it/", - "instances": [ - "Mozilla/5.0 (compatible; IstellaBot/1.23.15 
+http://www.tiscali.it/)" - ], - "addition_date": "2016/01/09" - } - , - { - "pattern": "DeuSu\\/", - "addition_date": "2016/01/23", - "url": "https://deusu.de/robot.html", - "instances": [ - "Mozilla/5.0 (compatible; DeuSu/0.1.0; +https://deusu.org)", - "Mozilla/5.0 (compatible; DeuSu/5.0.2; +https://deusu.de/robot.html)" - ] - } - , - { - "pattern": "betaBot", - "addition_date": "2016/01/23", - "instances": [] - } - , - { - "pattern": "Cliqzbot\\/", - "addition_date": "2016/01/23", - "url": "http://cliqz.com/company/cliqzbot", - "instances": [ - "Cliqzbot/0.1 (+http://cliqz.com +cliqzbot@cliqz.com)", - "Cliqzbot/0.1 (+http://cliqz.com/company/cliqzbot)", - "Mozilla/5.0 (compatible; Cliqzbot/0.1 +http://cliqz.com/company/cliqzbot)", - "Mozilla/5.0 (compatible; Cliqzbot/1.0 +http://cliqz.com/company/cliqzbot)" - ] - } - , - { - "pattern": "MojeekBot\\/", - "addition_date": "2016/01/23", - "url": "https://www.mojeek.com/bot.html", - "instances": [ - "MojeekBot/0.2 (archi; http://www.mojeek.com/bot.html)", - "Mozilla/5.0 (compatible; MojeekBot/0.2; http://www.mojeek.com/bot.html#relaunch)", - "Mozilla/5.0 (compatible; MojeekBot/0.2; http://www.mojeek.com/bot.html)", - "Mozilla/5.0 (compatible; MojeekBot/0.5; http://www.mojeek.com/bot.html)", - "Mozilla/5.0 (compatible; MojeekBot/0.6; +https://www.mojeek.com/bot.html)", - "Mozilla/5.0 (compatible; MojeekBot/0.6; http://www.mojeek.com/bot.html)" - ] - } - , - { - "pattern": "netEstate NE Crawler", - "addition_date": "2016/01/23", - "url": "+http://www.website-datenbank.de/", - "instances": [ - "netEstate NE Crawler (+http://www.sengine.info/)", - "netEstate NE Crawler (+http://www.website-datenbank.de/)" - ] - } - , - { - "pattern": "SafeSearch microdata crawler", - "addition_date": "2016/01/23", - "url": "https://safesearch.avira.com", - "instances": [ - "SafeSearch microdata crawler (https://safesearch.avira.com, safesearch-abuse@avira.com)" - ] - } - , - { - "pattern": "Gluten Free Crawler\\/", - "addition_date": 
"2016/01/23", - "url": "http://glutenfreepleasure.com/", - "instances": [ - "Mozilla/5.0 (compatible; Gluten Free Crawler/1.0; +http://glutenfreepleasure.com/)" - ] - } - , - { - "pattern": "Sonic", - "addition_date": "2016/02/08", - "url": "http://www.yama.info.waseda.ac.jp/~crawler/info.html", - "instances": [ - "Mozilla/5.0 (compatible; RankSonicSiteAuditor/1.0; +https://ranksonic.com/ranksonic_sab.html)", - "Mozilla/5.0 (compatible; Sonic/1.0; http://www.yama.info.waseda.ac.jp/~crawler/info.html)", - "Mozzila/5.0 (compatible; Sonic/1.0; http://www.yama.info.waseda.ac.jp/~crawler/info.html)" - ] - } - , - { - "pattern": "Sysomos", - "addition_date": "2016/02/08", - "url": "http://www.sysomos.com", - "instances": [ - "Mozilla/5.0 (compatible; Sysomos/1.0; +http://www.sysomos.com/; Sysomos)" - ] - } - , - { - "pattern": "Trove", - "addition_date": "2016/02/08", - "url": "http://www.trove.com", - "instances": [] - } - , - { - "pattern": "deadlinkchecker", - "addition_date": "2016/02/08", - "url": "http://www.deadlinkchecker.com", - "instances": [ - "www.deadlinkchecker.com Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36", - "www.deadlinkchecker.com XMLHTTP/1.0", - "www.deadlinkchecker.com XMLHTTP/1.0 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36" - ] - } - , - { - "pattern": "Slack-ImgProxy", - "addition_date": "2016/04/25", - "url": "https://api.slack.com/robots", - "instances": [ - "Slack-ImgProxy (+https://api.slack.com/robots)", - "Slack-ImgProxy 0.59 (+https://api.slack.com/robots)", - "Slack-ImgProxy 0.66 (+https://api.slack.com/robots)", - "Slack-ImgProxy 1.106 (+https://api.slack.com/robots)", - "Slack-ImgProxy 1.138 (+https://api.slack.com/robots)", - "Slack-ImgProxy 149 (+https://api.slack.com/robots)" - ] - } - , - { - "pattern": "Embedly", - "addition_date": "2016/04/25", - "url": "http://support.embed.ly", - "instances": [ - 
"Embedly +support@embed.ly", - "Mozilla/5.0 (compatible; Embedly/0.2; +http://support.embed.ly/)", - "Mozilla/5.0 (compatible; Embedly/0.2; snap; +http://support.embed.ly/)" - ] - } - , - { - "pattern": "RankActiveLinkBot", - "addition_date": "2016/06/20", - "url": "https://rankactive.com/resources/rankactive-linkbot", - "instances": [ - "Mozilla/5.0 (compatible; RankActiveLinkBot; +https://rankactive.com/resources/rankactive-linkbot)" - ] - } - , - { - "pattern": "iskanie", - "addition_date": "2016/09/02", - "url": "http://www.iskanie.com", - "instances": [ - "iskanie (+http://www.iskanie.com)" - ] - } - , - { - "pattern": "SafeDNSBot", - "addition_date": "2016/09/10", - "url": "https://www.safedns.com/searchbot", - "instances": [ - "SafeDNSBot (https://www.safedns.com/searchbot)" - ] - } - , - { - "pattern": "SkypeUriPreview", - "addition_date": "2016/10/10", - "instances": [ - "Mozilla/5.0 (Windows NT 6.1; WOW64) SkypeUriPreview Preview/0.5" - ] - } - , - { - "pattern": "Veoozbot", - "addition_date": "2016/11/03", - "url": "http://www.veooz.com/veoozbot.html", - "instances": [ - "Mozilla/5.0 (compatible; Veoozbot/1.0; +http://www.veooz.com/veoozbot.html)" - ] - } - , - { - "pattern": "Slackbot", - "addition_date": "2016/11/03", - "url": "https://api.slack.com/robots", - "instances": [ - "Slackbot-LinkExpanding (+https://api.slack.com/robots)", - "Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)" - ] - } - , - { - "pattern": "redditbot", - "addition_date": "2016/11/03", - "url": "http://www.reddit.com/feedback", - "instances": [ - "Mozilla/5.0 (compatible; redditbot/1.0; +http://www.reddit.com/feedback)" - ] - } - , - { - "pattern": "datagnionbot", - "addition_date": "2016/11/03", - "url": "http://www.datagnion.com/bot.html", - "instances": [ - "datagnionbot (+http://www.datagnion.com/bot.html)" - ] - } - , - { - "pattern": "Google-Adwords-Instant", - "addition_date": "2016/11/03", - "url": "http://www.google.com/adsbot.html", - "instances": [ - 
"Google-Adwords-Instant (+http://www.google.com/adsbot.html)" - ] - } - , - { - "pattern": "adbeat_bot", - "addition_date": "2016/11/04", - "instances": [ - "Mozilla/5.0 (compatible; adbeat_bot; +support@adbeat.com; support@adbeat.com)", - "adbeat_bot" - ] - } - , - { - "pattern": "WhatsApp", - "addition_date": "2016/11/15", - "url": "https://www.whatsapp.com/", - "instances": [ - "WhatsApp", - "WhatsApp/2.12.15/i", - "WhatsApp/2.12.16/i", - "WhatsApp/2.12.17/i", - "WhatsApp/2.12.449 A", - "WhatsApp/2.12.453 A", - "WhatsApp/2.12.510 A", - "WhatsApp/2.12.540 A", - "WhatsApp/2.12.548 A", - "WhatsApp/2.12.555 A", - "WhatsApp/2.12.556 A", - "WhatsApp/2.16.1/i", - "WhatsApp/2.16.13 A", - "WhatsApp/2.16.2/i", - "WhatsApp/2.16.42 A", - "WhatsApp/2.16.57 A" - ] - } - , - { - "pattern": "contxbot", - "addition_date": "2017/02/25", - "instances": [ - "Mozilla/5.0 (compatible;contxbot/1.0)" - ] - } - , - { - "pattern": "pinterest", - "addition_date": "2017/03/03", - "instances": [ - "Pinterest/0.2 (+http://www.pinterest.com/bot.html)" - ], - "url": "http://www.pinterest.com/bot.html" - } - , - { - "pattern": "electricmonk", - "addition_date": "2017/03/04", - "instances": [ - "Mozilla/5.0 (compatible; electricmonk/3.2.0 +https://www.duedil.com/our-crawler/)" - ], - "url": "https://www.duedil.com/our-crawler/" - } - , - { - "pattern": "GarlikCrawler", - "addition_date": "2017/03/18", - "instances": [ - "GarlikCrawler/1.2 (http://garlik.com/, crawler@garlik.com)" - ], - "url": "http://garlik.com/" - } - , - { - "pattern": "BingPreview\\/", - "addition_date": "2017/04/23", - "url": "https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0", - "instances": [ - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534+ (KHTML, like Gecko) BingPreview/1.0b", - "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0; BingPreview/1.0b) like Gecko", - "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; WOW64; Trident/6.0; BingPreview/1.0b)", - "Mozilla/5.0 
(compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; WOW64; Trident/5.0; BingPreview/1.0b)", - "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 BingPreview/1.0b" - ] - } - , - { - "pattern": "vebidoobot", - "addition_date": "2017/05/08", - "instances": [ - "Mozilla/5.0 (compatible; vebidoobot/1.0; +https://blog.vebidoo.de/vebidoobot/" - ], - "url": "https://blog.vebidoo.de/vebidoobot/" - } - , - { - "pattern": "FemtosearchBot", - "addition_date": "2017/05/16", - "instances": [ - "Mozilla/5.0 (compatible; FemtosearchBot/1.0; http://femtosearch.com)" - ], - "url": "http://femtosearch.com" - } - , - { - "pattern": "Yahoo Link Preview", - "addition_date": "2017/06/28", - "instances": [ - "Mozilla/5.0 (compatible; Yahoo Link Preview; https://help.yahoo.com/kb/mail/yahoo-link-preview-SLN23615.html)" - ], - "url": "https://help.yahoo.com/kb/mail/yahoo-link-preview-SLN23615.html" - } - , - { - "pattern": "MetaJobBot", - "addition_date": "2017/08/16", - "instances": [ - "Mozilla/5.0 (compatible; MetaJobBot; http://www.metajob.de/crawler)" - ], - "url": "http://www.metajob.de/the/crawler" - } - , - { - "pattern": "DomainStatsBot", - "addition_date": "2017/08/16", - "instances": [ - "DomainStatsBot/1.0 (http://domainstats.io/our-bot)" - ], - "url": "http://domainstats.io/our-bot" - } - , - { - "pattern": "mindUpBot", - "addition_date": "2017/08/16", - "instances": [ - "mindUpBot (datenbutler.de)" - ], - "url": "http://www.datenbutler.de/" - } - , - { - "pattern": "Daum\\/", - "addition_date": "2017/08/16", - "instances": [ - "Mozilla/5.0 (compatible; Daum/4.1; +http://cs.daum.net/faq/15/4118.html?faqId=28966)" - ], - "url": "http://cs.daum.net/faq/15/4118.html?faqId=28966" - } - , - { - "pattern": "Jugendschutzprogramm-Crawler", - "addition_date": "2017/08/16", - "instances": [ - "Jugendschutzprogramm-Crawler; Info: http://www.jugendschutzprogramm.de" - ], - "url": 
"http://www.jugendschutzprogramm.de" - } - , - { - "pattern": "Xenu Link Sleuth", - "addition_date": "2017/08/19", - "instances": [ - "Xenu Link Sleuth/1.3.8" - ], - "url": "http://home.snafu.de/tilman/xenulink.html" - } - , - { - "pattern": "Pcore-HTTP", - "addition_date": "2017/08/19", - "instances": [ - "Pcore-HTTP/v0.40.3" - ], - "url": "https://bitbucket.org/softvisio/pcore/overview" - } - , - { - "pattern": "moatbot", - "addition_date": "2017/09/16", - "instances": [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36 moatbot", - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4 moatbot" - ], - "url": "https://moat.com" - } - , - { - "pattern": "KosmioBot", - "addition_date": "2017/09/16", - "instances": [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36 (compatible; KosmioBot/1.0; +http://kosm.io/bot.html)" - ], - "url": "http://kosm.io/bot.html" - } - , - { - "pattern": "pingdom", - "addition_date": "2017/09/16", - "instances": [ - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36 PingdomPageSpeed/1.0 (pingbot/2.0; +http://www.pingdom.com/)", - "Mozilla/5.0 (compatible; pingbot/2.0; +http://www.pingdom.com/)" - ], - "url": "http://www.pingdom.com" - } - , - { - "pattern": "PhantomJS", - "addition_date": "2017/09/18", - "instances": [ - "Mozilla/5.0 (Unknown; Linux x86_64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1 bl.uk_lddc_renderbot/2.0.0 (+ http://www.bl.uk/aboutus/legaldeposit/websites/websites/faqswebmaster/index.html)" - ], - "url": "http://phantomjs.org/" - } - , - { - "pattern": "Gowikibot", - "addition_date": "2017/10/26", - "instances": [ - "Mozilla/5.0 (compatible; Gowikibot/1.0; 
+http://www.gowikibot.com)" - ], - "url": "http://www.gowikibot.com" - } - , - { - "pattern": "PiplBot", - "addition_date": "2017/10/30", - "instances": [ - "Mozilla/5.0+(compatible;+PiplBot;+http://www.pipl.com/bot/)" - ], - "url": "http://www.pipl.com/bot/" - } - , - { - "pattern": "Discordbot", - "addition_date": "2017/09/22", - "url": "https://discordapp.com", - "instances": [ - "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)" - ] - } - , - { - "pattern": "TelegramBot", - "addition_date": "2017/10/01", - "instances": [ - "TelegramBot (like TwitterBot)" - ] - } - , - { - "pattern": "Jetslide", - "addition_date": "2017/09/27", - "url": "http://jetsli.de/crawler", - "instances": [ - "Mozilla/5.0 (compatible; Jetslide; +http://jetsli.de/crawler)" - ] - } - , - { - "pattern": "newsharecounts", - "addition_date": "2017/09/30", - "url": "http://newsharecounts.com/crawler", - "instances": [ - "Mozilla/5.0 (compatible; NewShareCounts.com/1.0; +http://newsharecounts.com/crawler)" - ] - } - , - { - "pattern": "James BOT", - "addition_date": "2017/10/12", - "url": "http://cognitiveseo.com/bot.html", - "instances": [ - "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6 - James BOT - WebCrawler http://cognitiveseo.com/bot.html" - ] - } - , - { - "pattern": "Barkrowler", - "addition_date": "2017/10/09", - "url": "http://www.exensa.com/crawl", - "instances": [ - "Barkrowler/0.5.1 (experimenting / debugging - sorry for your logs ) http://www.exensa.com/crawl - admin@exensa.com -- based on BuBiNG", - "Barkrowler/0.7 (+http://www.exensa.com/crawl)" - ] - } - , - { - "pattern": "TinEye", - "addition_date": "2017/10/14", - "url": "http://www.tineye.com/crawler.html", - "instances": [ - "Mozilla/5.0 (compatible; TinEye-bot/1.31; +http://www.tineye.com/crawler.html)", - "TinEye/1.1 (http://tineye.com/crawler.html)" - ] - } - , - { - "pattern": "SocialRankIOBot", - "addition_date": "2017/10/19", - "url": 
"http://socialrank.io/about", - "instances": [ - "SocialRankIOBot; http://socialrank.io/about" - ] - } - , - { - "pattern": "trendictionbot", - "addition_date": "2017/10/30", - "url": "http://www.trendiction.de/bot", - "instances": [ - "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.0; trendictionbot0.5.0; trendiction search; http://www.trendiction.de/bot; please let us know of any problems; web at trendiction.com) Gecko/20071127 Firefox/3.0.0.11" - ] - } - , - { - "pattern": "Ocarinabot", - "addition_date": "2017/09/27", - "instances": [ - "Ocarinabot" - ] - } - , - { - "pattern": "epicbot", - "addition_date": "2017/10/31", - "url": "http://www.epictions.com/epicbot", - "instances": [ - "Mozilla/5.0 (compatible; epicbot; +http://www.epictions.com/epicbot)" - ] - } - , - { - "pattern": "Primalbot", - "addition_date": "2017/09/27", - "url": "https://www.primal.com", - "instances": [ - "Mozilla/5.0 (compatible; Primalbot; +https://www.primal.com;)" - ] - } - , - { - "pattern": "DuckDuckGo-Favicons-Bot", - "addition_date": "2017/10/06", - "url": "http://duckduckgo.com", - "instances": [ - "Mozilla/5.0 (compatible; DuckDuckGo-Favicons-Bot/1.0; +http://duckduckgo.com)" - ] - } - , - { - "pattern": "GnowitNewsbot", - "addition_date": "2017/10/30", - "url": "http://www.gnowit.com", - "instances": [ - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0 / GnowitNewsbot / Contact information at http://www.gnowit.com" - ] - } - , - { - "pattern": "Leikibot", - "addition_date": "2017/09/24", - "url": "http://www.leiki.com", - "instances": [ - "Mozilla/5.0 (Windows NT 6.3;compatible; Leikibot/1.0; +http://www.leiki.com)" - ] - } - , - { - "pattern": "LinkArchiver", - "addition_date": "2017/09/24", - "instances": [ - "@LinkArchiver twitter bot" - ] - } - , - { - "pattern": "YaK\\/", - "addition_date": "2017/09/25", - "url": "http://linkfluence.com", - "instances": [ - "Mozilla/5.0 (compatible; YaK/1.0; http://linkfluence.com/; 
bot@linkfluence.com)" - ] - } - , - { - "pattern": "PaperLiBot", - "addition_date": "2017/09/25", - "url": "http://support.paper.li/entries/20023257-what-is-paper-li", - "instances": [ - "Mozilla/5.0 (compatible; PaperLiBot/2.1; http://support.paper.li/entries/20023257-what-is-paper-li)" - ] - } - , - { - "pattern": "Digg Deeper", - "addition_date": "2017/09/26", - "url": "http://digg.com/about", - "instances": [ - "Digg Deeper/v1 (http://digg.com/about)" - ] - } - , - { - "pattern": "dcrawl", - "addition_date": "2017/09/22", - "instances": [ - "dcrawl/1.0" - ] - } - , - { - "pattern": "Snacktory", - "addition_date": "2017/09/23", - "url": "https://github.com/karussell/snacktory", - "instances": [ - "Mozilla/5.0 (compatible; Snacktory; +https://github.com/karussell/snacktory)" - ] - } - , - { - "pattern": "AndersPinkBot", - "addition_date": "2017/09/24", - "url": "http://anderspink.com/bot.html", - "instances": [ - "Mozilla/5.0 (compatible; AndersPinkBot/1.0; +http://anderspink.com/bot.html)" - ] - } - , - { - "pattern": "Fyrebot", - "addition_date": "2017/09/22", - "instances": [ - "Fyrebot/1.0" - ] - } - , - { - "pattern": "EveryoneSocialBot", - "addition_date": "2017/09/22", - "url": "http://everyonesocial.com", - "instances": [ - "Mozilla/5.0 (compatible; EveryoneSocialBot/1.0; support@everyonesocial.com http://everyonesocial.com/)" - ] - } - , - { - "pattern": "Mediatoolkitbot", - "addition_date": "2017/10/06", - "url": "http://mediatoolkit.com", - "instances": [ - "Mediatoolkitbot (complaints@mediatoolkit.com)" - ] - } - , - { - "pattern": "Luminator-robots", - "addition_date": "2017/09/22", - "instances": [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/30.0.1599.66 Safari/537.13 Luminator-robots/2.0" - ] - } - , - { - "pattern": "ExtLinksBot", - "addition_date": "2017/11/02", - "url": "https://extlinks.com/Bot.html", - "instances": [ - "Mozilla/5.0 (compatible; ExtLinksBot/1.5 +https://extlinks.com/Bot.html)" 
- ] - } - , - { - "pattern": "SurveyBot", - "addition_date": "2017/11/02", - "instances": [ - "Mozilla/5.0 (Windows; U; Windows NT 5.1; en; rv:1.9.0.13) Gecko/2009073022 Firefox/3.5.2 (.NET CLR 3.5.30729) SurveyBot/2.3 (DomainTools)" - ] - } - , - { - "pattern": "NING\\/", - "addition_date": "2017/11/02", - "instances": [ - "NING/1.0" - ] - } - , - { - "pattern": "okhttp", - "addition_date": "2017/11/02", - "instances": [ - "okhttp/2.5.0", - "okhttp/2.7.5", - "okhttp/3.2.0", - "okhttp/3.5.0" - ] - } - , - { - "pattern": "Nuzzel", - "addition_date": "2017/11/02", - "instances": [ - "Nuzzel" - ] - } - , - { - "pattern": "omgili", - "addition_date": "2017/11/02", - "url": "http://omgili.com", - "instances": [ - "omgili/0.5 +http://omgili.com" - ] - } - , - { - "pattern": "PocketParser", - "addition_date": "2017/11/02", - "url": "https://getpocket.com/pocketparser_ua", - "instances": [ - "PocketParser/2.0 (+https://getpocket.com/pocketparser_ua)" - ] - } - , - { - "pattern": "YisouSpider", - "addition_date": "2017/11/02", - "instances": [ - "YisouSpider" - ] - } - , - { - "pattern": "um-LN", - "addition_date": "2017/11/02", - "instances": [ - "Mozilla/5.0 (compatible; um-LN/1.0; mailto: techinfo@ubermetrics-technologies.com)" - ] - } - , - { - "pattern": "ToutiaoSpider", - "addition_date": "2017/11/02", - "url": "http://web.toutiao.com/media_cooperation/", - "instances": [ - "Mozilla/5.0 (compatible; ToutiaoSpider/1.0; http://web.toutiao.com/media_cooperation/;)" - ] - } - , - { - "pattern": "MuckRack", - "addition_date": "2017/11/02", - "url": "http://muckrack.com", - "instances": [ - "Mozilla/5.0 (compatible; MuckRack/1.0; +http://muckrack.com)" - ] - } - , - { - "pattern": "Jamie's Spider", - "addition_date": "2017/11/02", - "url": "http://jamiembrown.com/", - "instances": [ - "Jamie's Spider (http://jamiembrown.com/)" - ] - } - , - { - "pattern": "AHC\\/", - "addition_date": "2017/11/02", - "instances": [ - "AHC/2.0" - ] - } - , - { - "pattern": 
"NetcraftSurveyAgent", - "addition_date": "2017/11/02", - "instances": [ - "Mozilla/5.0 (compatible; NetcraftSurveyAgent/1.0; +info@netcraft.com)" - ] - } - , - { - "pattern": "Laserlikebot", - "addition_date": "2017/11/02", - "instances": [ - "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Laserlikebot/0.1)" - ] - } - , - { - "pattern": "Apache-HttpClient", - "addition_date": "2017/11/02", - "instances": [ - "Apache-HttpClient/4.2.3 (java 1.5)", - "Apache-HttpClient/4.2.5 (java 1.5)", - "Apache-HttpClient/4.3.1 (java 1.5)", - "Apache-HttpClient/4.3.3 (java 1.5)", - "Apache-HttpClient/4.3.5 (java 1.5)", - "Apache-HttpClient/4.4.1 (Java/1.8.0_65)", - "Apache-HttpClient/4.5.3 (Java/1.8.0_121)" - ] - } - , - { - "pattern": "AppEngine-Google", - "addition_date": "2017/11/02", - "instances": [ - "AppEngine-Google; (+http://code.google.com/appengine; appid: example)" - ] - } - , - { - "pattern": "Jetty", - "addition_date": "2017/11/02", - "instances": [ - "Jetty/9.3.z-SNAPSHOT" - ] - } - , - { - "pattern": "Upflow", - "addition_date": "2017/11/02", - "instances": [ - "Upflow/1.0" - ] - } - , - { - "pattern": "Thinklab", - "addition_date": "2017/11/02", - "url": "thinklab.com", - "instances": [ - "Thinklab (thinklab.com)" - ] - } - , - { - "pattern": "Traackr.com", - "addition_date": "2017/11/02", - "url": "Traackr.com", - "instances": [ - "Traackr.com" - ] - } - , - { - "pattern": "Twurly", - "addition_date": "2017/11/02", - "url": "http://twurly.org", - "instances": [ - "Ruby, Twurly v1.1 (http://twurly.org)" - ] - } - , - { - "pattern": "Mastodon", - "addition_date": "2017/11/02", - "instances": [ - "http.rb/2.2.2 (Mastodon/1.5.1; +https://example-masto-instance.org/)" - ] - } - , - { - "pattern": "http_get", - "addition_date": "2017/11/02", - "instances": [ - "http_get" - ] - } - , - { - "pattern": "DnyzBot", - "addition_date": "2017/11/20", - "instances": [ - 
"Mozilla/5.0 (compatible; DnyzBot/1.0)", - "Mozilla/5.0 (compatible; DnyzBot/1.0) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/64.0.3282.167 Safari/537.36", - "Mozilla/5.0 (compatible; DnyzBot/1.0) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/64.0.3264.0 Safari/537.36" - ] - } - , - { - "pattern": "botify", - "addition_date": "2018/02/01", - "instances": [ - "Mozilla/5.0 (compatible; botify; http://botify.com)" - ] - } - , - { - "pattern": "007ac9 Crawler", - "addition_date": "2018/02/09", - "instances": [ - "Mozilla/5.0 (compatible; 007ac9 Crawler; http://crawler.007ac9.net/)" - ] - } - , - { - "pattern": "BehloolBot", - "addition_date": "2018/02/09", - "instances": [ - "Mozilla/5.0 (compatible; BehloolBot/beta; +http://www.webeaver.com/bot)" - ] - } - , - { - "pattern": "BrandVerity", - "addition_date": "2018/02/27", - "instances": [ - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:41.0) Gecko/20100101 Firefox/55.0 BrandVerity/1.0 (http://www.brandverity.com/why-is-brandverity-visiting-me)" - ] - } - , - { - "pattern": "check_http", - "addition_date": "2018/02/09", - "instances": [ - "check_http/v2.2.1 (nagios-plugins 2.2.1)" - ] - } - , - { - "pattern": "BDCbot", - "addition_date": "2018/02/09", - "instances": [ - "Mozilla/5.0 (Windows NT 6.1; compatible; BDCbot/1.0; +http://bigweb.bigdatacorp.com.br/faq.aspx) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36" - ] - } - , - { - "pattern": "ZumBot", - "addition_date": "2018/02/09", - "instances": [ - "Mozilla/5.0 (compatible; ZumBot/1.0; http://help.zum.com/inquiry)" - ] - } - , - { - "pattern": "EZID", - "addition_date": "2018/02/09", - "instances": [ - "EZID (EZID link checker; https://ezid.cdlib.org/)" - ] - } - , - { - "pattern": "ICC-Crawler", - "addition_date": "2018/02/28", - "instances": [ - "ICC-Crawler/2.0 (Mozilla-compatible; ; http://ucri.nict.go.jp/en/icccrawler.html)" - ], - "url": "http://ucri.nict.go.jp/en/icccrawler.html" - } - , - { - "pattern": 
"ArchiveBot", - "addition_date": "2018/02/28", - "instances": [ - "ArchiveTeam ArchiveBot/20170106.02 (wpull 2.0.2)" - ], - "url": "https://github.com/ArchiveTeam/ArchiveBot" - } - , - { - "pattern": "^LCC ", - "addition_date": "2018/02/28", - "instances": [ - "LCC (+http://corpora.informatik.uni-leipzig.de/crawler_faq.html)" - ], - "url": "http://corpora.informatik.uni-leipzig.de/crawler_faq.html" - } - , - { - "pattern": "filterdb.iss.net\\/crawler", - "addition_date": "2018/03/16", - "instances": [ - "Mozilla/5.0 (compatible; oBot/2.3.1; +http://filterdb.iss.net/crawler/)" - ], - "url": "http://filterdb.iss.net/crawler/" - } - , - { - "pattern": "BLP_bbot", - "addition_date": "2018/03/27", - "instances": [ - "BLP_bbot/0.1" - ] - } - , - { - "pattern": "BomboraBot", - "addition_date": "2018/03/27", - "instances": [ - "Mozilla/5.0 (compatible; BomboraBot/1.0; +http://www.bombora.com/bot)" - ], - "url": "http://www.bombora.com/bot" - } - , - { - "pattern": "Buck\\/", - "addition_date": "2018/03/27", - "instances": [ - "Buck/2.2; (+https://app.hypefactors.com/media-monitoring/about.html)" - ], - "url": "https://app.hypefactors.com/media-monitoring/about.html" - } - , - { - "pattern": "Companybook-Crawler", - "addition_date": "2018/03/27", - "instances": [ - "Companybook-Crawler (+https://www.companybooknetworking.com/)" - ], - "url": "https://www.companybooknetworking.com/" - } - , - { - "pattern": "Genieo", - "addition_date": "2018/03/27", - "instances": [ - "Mozilla/5.0 (compatible; Genieo/1.0 http://www.genieo.com/webfilter.html)" - ], - "url": "http://www.genieo.com/webfilter.html" - } - , - { - "pattern": "magpie-crawler", - "addition_date": "2018/03/27", - "instances": [ - "magpie-crawler/1.1 (U; Linux amd64; en-GB; +http://www.brandwatch.net)" - ], - "url": "http://www.brandwatch.net" - } - , - { - "pattern": "MeltwaterNews", - "addition_date": "2018/03/27", - "instances": [ - "MeltwaterNews www.meltwater.com" - ], - "url": "http://www.meltwater.com" - } - , 
- { - "pattern": "Moreover", - "addition_date": "2018/03/27", - "instances": [ - "Mozilla/5.0 Moreover/5.1 (+http://www.moreover.com)" - ], - "url": "http://www.moreover.com" - } - , - { - "pattern": "newspaper\\/", - "addition_date": "2018/03/27", - "instances": [ - "newspaper/0.2.5", - "newspaper/0.2.6", - "newspaper/0.1.0.7" - ] - } - , - { - "pattern": "ScoutJet", - "addition_date": "2018/03/27", - "instances": [ - "Mozilla/5.0 (compatible; ScoutJet; +http://www.scoutjet.com/)" - ], - "url": "http://www.scoutjet.com/" - } - , - { - "pattern": "(^| )sentry\\/", - "addition_date": "2018/03/27", - "instances": [ - "sentry/8.22.0 (https://sentry.io)" - ], - "url": "https://sentry.io" - } - , - { - "pattern": "StorygizeBot", - "addition_date": "2018/03/27", - "instances": [ - "Mozilla/5.0 (compatible; StorygizeBot; http://www.storygize.com)" - ], - "url": "http://www.storygize.com" - } - , - { - "pattern": "UptimeRobot", - "addition_date": "2018/03/27", - "instances": [ - "Mozilla/5.0+(compatible; UptimeRobot/2.0; http://www.uptimerobot.com/)" - ], - "url": "http://www.uptimerobot.com/" - } - , - { - "pattern": "OutclicksBot", - "addition_date": "2018/04/21", - "instances": [ - "OutclicksBot/2 +https://www.outclicks.net/agent/VjzDygCuk4ubNmg40ZMbFqT0sIh7UfOKk8s8ZMiupUR", - "OutclicksBot/2 +https://www.outclicks.net/agent/gIYbZ38dfAuhZkrFVl7sJBFOUhOVct6J1SvxgmBZgCe", - "OutclicksBot/2 +https://www.outclicks.net/agent/PryJzTl8POCRHfvEUlRN5FKtZoWDQOBEvFJ2wh6KH5J", - "OutclicksBot/2 +https://www.outclicks.net/agent/p2i4sNUh7eylJF1S6SGgRs5mP40ExlYvsr9GBxVQG6h" - ], - "url": "https://www.outclicks.net" - } - , - { - "pattern": "seoscanners", - "addition_date": "2018/05/27", - "instances": [ - "Mozilla/5.0 (compatible; seoscanners.net/1; +spider@seoscanners.net)" - ], - "url": "http://www.seoscanners.net/" - } - , - { - "pattern": "Hatena", - "addition_date": "2018/05/29", - "instances": [ - "Hatena Antenna/0.3", - "Hatena::Russia::Crawler/0.01" - ] - } - , - { - 
"pattern": "Google Web Preview", - "addition_date": "2018/05/31", - "instances": [ - "Mozilla/5.0 (Linux; U; Android 2.3.4; generic) AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview) Version/4.0 Mobile Safari/537.36" - ] - } - , - { - "pattern": "MauiBot", - "addition_date": "2018/06/06", - "instances": [ - "MauiBot (crawler.feedback+wc@gmail.com)" - ] - } - , - { - "pattern": "AlphaBot", - "addition_date": "2018/05/27", - "instances": [ - "Mozilla/5.0 (compatible; AlphaBot/3.2; +http://alphaseobot.com/bot.html)" - ], - "url": "http://alphaseobot.com/bot.html" - } - , - { - "pattern": "SBL-BOT", - "addition_date": "2018/06/06", - "instances": [ - "SBL-BOT (http://sbl.net)" - ], - "url": "http://sbl.net", - "description" : "Bot of SoftByte BlackWidow" - } - , - { - "pattern": "IAS crawler", - "addition_date": "2018/06/06", - "instances": [ - "IAS crawler (ias_crawler; http://integralads.com/site-indexing-policy/)" - ], - "url": "http://integralads.com/site-indexing-policy/", - "description" : "Bot of Integral Ad Science, Inc." 
- } - , - { - "pattern": "adscanner", - "addition_date": "2018/06/24", - "instances": [ - "Mozilla/5.0 (compatible; adscanner/)" - ] - } - , - { - "pattern": "Netvibes", - "addition_date": "2018/06/24", - "instances": [ - "Netvibes (crawler/bot; http://www.netvibes.com" - ], - "url": "http://www.netvibes.com" - } - , - { - "pattern": "acapbot", - "addition_date": "2018/06/27", - "instances": [ - "Mozilla/5.0 (compatible;acapbot/0.1;treat like Googlebot)", - "Mozilla/5.0 (compatible;acapbot/0.1.;treat like Googlebot)" - ] - } - , - { - "pattern": "Baidu-YunGuanCe", - "addition_date": "2018/06/27", - "instances": [ - "Baidu-YunGuanCe-Bot(ce.baidu.com)", - "Baidu-YunGuanCe-SLABot(ce.baidu.com)", - "Baidu-YunGuanCe-ScanBot(ce.baidu.com)", - "Baidu-YunGuanCe-PerfBot(ce.baidu.com)", - "Baidu-YunGuanCe-VSBot(ce.baidu.com)" - ], - "url": "https://ce.baidu.com/topic/topic20150908", - "description": "Baidu Cloud Watch" - } - , - { - "pattern": "bitlybot", - "addition_date": "2018/06/27", - "instances": [ - "bitlybot/3.0 (+http://bit.ly/)", - "bitlybot/2.0", - "bitlybot" - ], - "url": "http://bit.ly/" - } - , - { - "pattern": "blogmuraBot", - "addition_date": "2018/06/27", - "instances": [ - "blogmuraBot (+http://www.blogmura.com)" - ], - "url": "http://www.blogmura.com", - "description": "A blog ranking site which links to blogs on just about every theme possible." 
- } - , - { - "pattern": "Bot.AraTurka.com", - "addition_date": "2018/06/27", - "instances": [ - "Bot.AraTurka.com/0.0.1" - ], - "url": "http://www.araturka.com" - } - , - { - "pattern": "bot-pge.chlooe.com", - "addition_date": "2018/06/27", - "instances": [ - "bot-pge.chlooe.com/1.0.0 (+http://www.chlooe.com/)" - ] - } - , - { - "pattern": "BoxcarBot", - "addition_date": "2018/06/27", - "instances": [ - "Mozilla/5.0 (compatible; BoxcarBot/1.1; +awesome@boxcar.io)" - ], - "url": "https://boxcar.io/" - } - , - { - "pattern": "BTWebClient", - "addition_date": "2018/06/27", - "instances": [ - "BTWebClient/180B(9704)" - ], - "url": "http://www.utorrent.com/", - "description": "µTorrent BitTorrent Client" - } - , - { - "pattern": "ContextAd Bot", - "addition_date": "2018/06/27", - "instances": [ - "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0;.NET CLR 1.0.3705; ContextAd Bot 1.0)", - "ContextAd Bot 1.0" - ] - } - , - { - "pattern": "Digincore bot", - "addition_date": "2018/06/27", - "instances": [ - "Mozilla/5.0 (compatible; Digincore bot; https://www.digincore.com/crawler.html for rules and instructions.)" - ], - "url": "http://www.digincore.com/crawler.html" - } - , - { - "pattern": "Disqus", - "addition_date": "2018/06/27", - "instances": [ - "Disqus/1.0" - ], - "url": "https://disqus.com/", - "description": "validate and quality check pages." - } - , - { - "pattern": "Feedly", - "addition_date": "2018/06/27", - "instances": [ - "Feedly/1.0 (+http://www.feedly.com/fetcher.html; like FeedFetcher-Google)", - "FeedlyBot/1.0 (http://feedly.com)" - ], - "url": "https://www.feedly.com/fetcher.html", - "description": "Feedly Fetcher is how Feedly grabs RSS or Atom feeds when users choose to add them to their Feedly or any of the other applications built on top of the feedly cloud." 
- } - , - { - "pattern": "Fetch\\/", - "addition_date": "2018/06/27", - "instances": [ - "Fetch/2.0a (CMS Detection/Web/SEO analysis tool, see http://guess.scritch.org)" - ] - } - , - { - "pattern": "Fever", - "addition_date": "2018/06/27", - "instances": [ - "Fever/1.38 (Feed Parser; http://feedafever.com; Allow like Gecko)" - ], - "url": "http://feedafever.com" - } - , - { - "pattern": "Flamingo_SearchEngine", - "addition_date": "2018/06/27", - "instances": [ - "Flamingo_SearchEngine (+http://www.flamingosearch.com/bot)" - ] - } - , - { - "pattern": "FlipboardProxy", - "addition_date": "2018/06/27", - "instances": [ - "Mozilla/5.0 (compatible; FlipboardProxy/1.1; +http://flipboard.com/browserproxy)", - "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6 (FlipboardProxy/1.1; +http://flipboard.com/browserproxy)", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:28.0) Gecko/20100101 Firefox/28.0 (FlipboardProxy/1.1; +http://flipboard.com/browserproxy)" - ], - "url": "https://about.flipboard.com/browserproxy/", - "description": "a proxy service to fetch, validate, and prepare certain elements of websites for presentation through the Flipboard Application" - } - , - { - "pattern": "g2reader-bot", - "addition_date": "2018/06/27", - "instances": [ - "g2reader-bot/1.0 (+http://www.g2reader.com/)" - ], - "url": "http://www.g2reader.com/" - } - , - { - "pattern": "imrbot", - "addition_date": "2018/06/27", - "instances": [ - "Mozilla/5.0 (compatible; imrbot/1.10.8 +http://www.mignify.com)" - ], - "url": "http://www.mignify.com" - } - , - { - "pattern": "K7MLWCBot", - "addition_date": "2018/06/27", - "instances": [ - "K7MLWCBot/1.0 (+http://www.k7computing.com)" - ], - "url": "http://www.k7computing.com", - "description": "Virus scanner" - } - , - { - "pattern": "Kemvibot", - "addition_date": "2018/06/27", - "instances": [ - "Kemvibot/1.0 (http://kemvi.com, marco@kemvi.com)" - ], - "url": "http://kemvi.com" - } - , - { - "pattern": 
"Landau-Media-Spider", - "addition_date": "2018/06/27", - "instances": [ - "Landau-Media-Spider/1.0(http://bots.landaumedia.de/bot.html)" - ], - "url": "http://bots.landaumedia.de/bot.html" - } - , - { - "pattern": "linkapediabot", - "addition_date": "2018/06/27", - "instances": [ - "linkapediabot (+http://www.linkapedia.com)" - ], - "url": "http://www.linkapedia.com" - } - , - { - "pattern": "vkShare", - "addition_date": "2018/07/02", - "instances": [ - "Mozilla/5.0 (compatible; vkShare; +http://vk.com/dev/Share)" - ], - "url": "http://vk.com/dev/Share" - } - , - { - "pattern": "Siteimprove.com", - "addition_date": "2018/06/22", - "instances": [ - "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) LinkCheck by Siteimprove.com", - "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.0) Match by Siteimprove.com", - "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) SiteCheck-sitecrawl by Siteimprove.com", - "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.0) LinkCheck by Siteimprove.com" - ] - } - , - { - "pattern": "BLEXBot\\/", - "addition_date": "2018/07/07", - "instances": [ - "Mozilla/5.0 (compatible; BLEXBot/1.0; +http://webmeup-crawler.com/)" - ], - "url": "http://webmeup-crawler.com" - } - , - { - "pattern": "DareBoost", - "addition_date": "2018/07/07", - "instances": [ - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36 DareBoost" - ], - "url": "https://www.dareboost.com/", - "description": "Bot to test, Analyze and Optimize website" - } - , - { - "pattern": "ZuperlistBot\\/", - "addition_date": "2018/07/07", - "instances": [ - "Mozilla/5.0 (compatible; ZuperlistBot/1.0)" - ] - } - , - { - "pattern": "Miniflux\\/", - "addition_date": "2018/07/07", - "instances": [ - "Mozilla/5.0 (compatible; Miniflux/2.0.7; +https://miniflux.net)" - ], - "url": "https://miniflux.net", - "description": "Miniflux is a minimalist and opinionated feed reader." 
- } - , - { - "pattern": "Feedspotbot\\/", - "addition_date": "2018/07/07", - "instances": [ - "Mozilla/5.0 (compatible; Feedspotbot/1.0; +http://www.feedspot.com/fs/bot)" - ], - "url": "http://www.feedspot.com/fs/bot" - } - , - { - "pattern": "Diffbot\\/", - "addition_date": "2018/07/07", - "instances": [ - "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2 (.NET CLR 3.5.30729; Diffbot/0.1; +http://www.diffbot.com)" - ], - "url": "http://www.diffbot.com" - } - , - { - "pattern": "SEOkicks", - "addition_date": "2018/08/22", - "instances": [ - "Mozilla/5.0 (compatible; SEOkicks; +https://www.seokicks.de/robot.html)" - ], - "url": "https://www.seokicks.de/robot.html" - } - , - { - "pattern": "tracemyfile", - "addition_date": "2018/08/23", - "instances": [ - "Mozilla/5.0 (compatible; tracemyfile/1.0; +bot@tracemyfile.com)" - ] - } - , - { - "pattern": "Nimbostratus-Bot", - "addition_date": "2018/08/29", - "instances": [ - "Mozilla/5.0 (compatible; Nimbostratus-Bot/v1.3.2; http://cloudsystemnetworks.com)" - ] - } - , - { - "pattern": "zgrab", - "addition_date": "2018/08/30", - "instances": [ - "Mozilla/5.0 zgrab/0.x" - ], - "url": "https://zmap.io/" - } - , - { - "pattern": "PR-CY.RU", - "addition_date": "2018/08/30", - "instances": [ - "Mozilla/5.0 (compatible; PR-CY.RU; + https://a.pr-cy.ru)" - ], - "url": "https://a.pr-cy.ru/" - } - , - { - "pattern": "AdsTxtCrawler", - "addition_date": "2018/08/30", - "instances": [ - "AdsTxtCrawler/1.0" - ] - }, - { - "pattern": "Datafeedwatch", - "addition_date": "2018/09/05", - "instances": [ - "Datafeedwatch/2.1.x" - ], - "url": "https://www.datafeedwatch.com/" - } - , - { - "pattern": "Zabbix", - "addition_date": "2018/09/05", - "instances": [ - "Zabbix" - ], - "url": "https://www.zabbix.com/documentation/3.4/manual/web_monitoring" - } - , - { - "pattern": "TangibleeBot", - "addition_date": "2018/09/05", - "instances": [ - "TangibleeBot/1.0.0.0 (http://tangiblee.com/bot)" - ], - "url": 
"http://tangiblee.com/bot" - } - , - { - "pattern": "google-xrawler", - "addition_date": "2018/09/05", - "instances": [ - "google-xrawler" - ], - "url": "https://webmasters.stackexchange.com/questions/105560/what-is-the-google-xrawler-user-agent-used-for" - } - , - { - "pattern": "axios", - "addition_date": "2018/09/06", - "instances": [ - "axios/0.18.0" - ], - "url": "https://github.com/axios/axios" - } - , - { - "pattern": "Amazon CloudFront", - "addition_date": "2018/09/07", - "instances": [ - "Amazon CloudFront" - ], - "url": "https://aws.amazon.com/cloudfront/" - } - , - { - "pattern": "Pulsepoint", - "addition_date": "2018/09/24", - "instances": [ - "Pulsepoint XT3 web scraper" - ] - } - , - { - "pattern": "CloudFlare-AlwaysOnline", - "addition_date": "2018/09/27", - "instances": [ - "Mozilla/5.0 (compatible; CloudFlare-AlwaysOnline/1.0; +http://www.cloudflare.com/always-online) AppleWebKit/534.34", - "Mozilla/5.0 (compatible; CloudFlare-AlwaysOnline/1.0; +https://www.cloudflare.com/always-online) AppleWebKit/534.34" - ], - "url" : "https://www.cloudflare.com/always-online/" - } -] diff --git a/lib/tasks/voight_kampff.rake b/lib/tasks/voight_kampff.rake deleted file mode 100644 index e7d6836..0000000 --- a/lib/tasks/voight_kampff.rake +++ /dev/null @@ -1,18 +0,0 @@ -namespace :voight_kampff do - desc 'Import a new crawler-user-agents.json file' - task :import_user_agents, :url do |t, args| - args.with_defaults url: 'https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json' - - require 'net/http' - - uri = URI(args[:url]) - contents = Net::HTTP.get(uri) - - if contents.present? 
- file = File.open('./config/crawler-user-agents.json', 'w') - file.write(contents.force_encoding(Encoding::UTF_8)) - else - puts "voight_kampff:import_user_agents - empty file received from #{uri}" - end - end -end diff --git a/lib/voight_kampff.rb b/lib/voight_kampff.rb index 0eb44fb..30f158f 100644 --- a/lib/voight_kampff.rb +++ b/lib/voight_kampff.rb @@ -1,17 +1,12 @@ -require 'json' +# frozen_string_literal: true require 'voight_kampff/test' require 'voight_kampff/methods' -require 'voight_kampff/rack_request' if defined?(Rack::Request) require 'voight_kampff/engine' if defined?(Rails) +# Class helper methods module VoightKampff class << self - def root - require 'pathname' - Pathname.new File.expand_path '..', File.dirname(__FILE__) - end - def human?(user_agent_string) test(user_agent_string).human? end @@ -19,7 +14,7 @@ def human?(user_agent_string) def bot?(user_agent_string) test(user_agent_string).bot? end - alias :replicant? :bot? + alias replicant? bot? private diff --git a/lib/voight_kampff/engine.rb b/lib/voight_kampff/engine.rb index 6f8f96f..1bd2402 100644 --- a/lib/voight_kampff/engine.rb +++ b/lib/voight_kampff/engine.rb @@ -1,10 +1,13 @@ +# frozen_string_literal: true + module VoightKampff + # Integration with Rails class Engine < Rails::Engine rake_tasks do load 'tasks/voight_kampff.rake' end - initializer :add_voight_kampff_methods do |app| + initializer :add_voight_kampff_methods do |_app| ActionDispatch::Request.class_eval do include VoightKampff::Methods end diff --git a/lib/voight_kampff/methods.rb b/lib/voight_kampff/methods.rb index 67e4e0e..c932b24 100644 --- a/lib/voight_kampff/methods.rb +++ b/lib/voight_kampff/methods.rb @@ -1,10 +1,15 @@ -module VoightKampff::Methods - def human? - VoightKampff::Test.new(user_agent).human? - end +# frozen_string_literal: true + +module VoightKampff + # Helper for Rack::Request + module Methods + extend Forwardable + def_delegators :voight_kampff, :human?, :bot?, :replicant? + + private - def bot? 
- VoightKampff::Test.new(user_agent).bot? + def voight_kampff + VoightKampff::Test.new(user_agent) + end end - alias :replicant? :bot? end diff --git a/lib/voight_kampff/rack_request.rb b/lib/voight_kampff/rack_request.rb index a7e1e00..26f42fc 100644 --- a/lib/voight_kampff/rack_request.rb +++ b/lib/voight_kampff/rack_request.rb @@ -1,4 +1,4 @@ +# frozen_string_literal: true + # Reopen the Rack::Request class to add bot detection methods -Rack::Request.class_eval do - include VoightKampff::Methods -end +Rack::Request.include VoightKampff::Methods if defined?(Rack::Request) diff --git a/lib/voight_kampff/test.rb b/lib/voight_kampff/test.rb index 5897098..1d66281 100644 --- a/lib/voight_kampff/test.rb +++ b/lib/voight_kampff/test.rb @@ -1,6 +1,79 @@ +# frozen_string_literal: true + +require 'net/http' +require 'json' + module VoightKampff + # Test User-Agent against Voight-Kampff class Test - CRAWLERS_FILENAME = 'crawler-user-agents.json' + DEFAULT_URL = + 'https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json' + CRAWLERS_FILENAME = File.basename(DEFAULT_URL).freeze + + class << self + def cache_path + @cache_path ||= "#{Dir.pwd}/tmp/#{CRAWLERS_FILENAME}" + end + + def cache_path=(value) + @cache_path = File.expand_path value + end + + def max_age + @max_age ||= 24 * 3600 + end + + attr_writer :max_age + + def crawlers + return @crawlers if @crawlers && fresh?(@last_updated_at) + + @last_updated_at = Time.now + @crawler_regexp = nil + @crawlers = JSON.parse(load_crawlers) + end + + def crawler_regexp + @crawler_regexp ||= begin + # NOTE: This is admittedly a bit convoluted + # but the performance gains make it worthwhile + crawler_patterns = + crawlers.map.with_index do |crawler, index| + "(?<match#{index}>#{crawler['pattern']})" + end.join('|') + crawler_patterns = "(#{crawler_patterns})" + Regexp.new(crawler_patterns, Regexp::IGNORECASE) + end + end + + def load_crawlers + load_local_crawlers || load_remote_crawlers + end + + private + + 
def fresh?(timestamp) + return false unless timestamp + + timestamp >= Time.now - max_age + end + + def load_remote_crawlers(uri = DEFAULT_URL) + crawlers = Net::HTTP.get(URI(uri)).force_encoding('UTF-8') + if cache_path + FileUtils.mkdir_p File.dirname cache_path + File.write(cache_path, crawlers) + end + crawlers + end + + def load_local_crawlers + if cache_path && File.exist?(cache_path) && + fresh?(File.mtime(cache_path)) + File.read(cache_path) + end + end + end attr_accessor :user_agent_string @@ -19,42 +92,16 @@ def human? def bot? !human? end - alias :replicant? :bot? + alias replicant? bot? private - def lookup_paths - # These paths should be orderd by priority - base_paths = [] - base_paths << Rails.root if defined? Rails - base_paths << VoightKampff.root - - base_paths.map { |p| p.join('config', CRAWLERS_FILENAME) } - end - - def preferred_path - lookup_paths.find { |path| File.exists? path } - end - def matching_crawler - if match = crawler_regexp.match(@user_agent_string) - index = match.names.first.sub(/match/, '').to_i - crawlers[index] - end - end - - def crawler_regexp - @@crawler_regexp ||= begin - # NOTE: This is admittedly a bit convoluted but the performance gains make it worthwhile - index = -1 - crawler_patterns = crawlers.map{|c| index += 1; "(?<match#{index}>#{c["pattern"]})" }.join("|") - crawler_patterns = "(#{crawler_patterns})" - Regexp.new(crawler_patterns, Regexp::IGNORECASE) - end - end + match = self.class.crawler_regexp.match(@user_agent_string) + return unless match&.names&.any? 
- def crawlers - @@crawlers ||= JSON.load(File.open(preferred_path, 'r')) + index = match.names.first.sub(/match/, '').to_i + self.class.crawlers[index] end end end diff --git a/lib/voight_kampff/version.rb b/lib/voight_kampff/version.rb index 7b3d9e5..14c25dd 100644 --- a/lib/voight_kampff/version.rb +++ b/lib/voight_kampff/version.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module VoightKampff - VERSION = '1.1.3' + VERSION = '2.0.0' end diff --git a/spec/controllers/replicants_controller_spec.rb b/spec/controllers/replicants_controller_spec.rb index 502df1d..8524b2b 100644 --- a/spec/controllers/replicants_controller_spec.rb +++ b/spec/controllers/replicants_controller_spec.rb @@ -1,9 +1,12 @@ +# frozen_string_literal: true + require 'spec_helper' describe ReplicantsController, type: :controller do let(:user_agent_string) { '' } before do - expect_any_instance_of(ActionController::TestRequest).to receive(:user_agent).and_return user_agent_string + expect_any_instance_of(ActionController::TestRequest) + .to receive(:user_agent).and_return user_agent_string get :index end diff --git a/spec/fixtures/crawler-user-agents.json b/spec/fixtures/crawler-user-agents.json new file mode 100644 index 0000000..90f9ca3 --- /dev/null +++ b/spec/fixtures/crawler-user-agents.json @@ -0,0 +1,43 @@ +[ + { + "pattern": "Googlebot\\/", + "url": "http://www.google.com/bot.html", + "instances": [ + "Googlebot/2.1 (+http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) 
AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36", + "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; Google Web Preview Analytics) Chrome/27.0.1453 Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" + ] + }, + { + "pattern": "bingbot", + "url": "http://www.bing.com/bingbot.htm", + "instances": [ + "Mozilla/5.0 (Windows Phone 8.1; ARM; Trident/7.0; Touch; rv:11.0; IEMobile/11.0; NOKIA; Lumia 530) like Gecko (compatible; adidxbot/2.0; +http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (compatible; adidxbot/2.0; http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (compatible; adidxbot/2.0; +http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) SitemapProbe", + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; adidxbot/2.0; http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; adidxbot/2.0; 
+http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (seoanalyzer; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)" + ] + }, + { + "pattern": "slurp", + "url": "http://help.yahoo.com/help/us/ysearch/slurp", + "instances": [ + "Mozilla/5.0 (compatible; Yahoo! Slurp/3.0; http://help.yahoo.com/help/us/ysearch/slurp)", + "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)" + ] + } +] diff --git a/spec/internal/app/controllers/replicants_controller.rb b/spec/internal/app/controllers/replicants_controller.rb index 5da67eb..93b5975 100644 --- a/spec/internal/app/controllers/replicants_controller.rb +++ b/spec/internal/app/controllers/replicants_controller.rb @@ -1,12 +1,15 @@ +# frozen_string_literal: true + class ReplicantsController < ActionController::Base def index header = "Replicants:\n===========\n" - status, content = if request.bot? - [200, '- Rick Deckard'] - else - [403, 'No replicants here'] - end + status, content = + if request.bot? 
+ [200, '- Rick Deckard'] + else + [403, 'No replicants here'] + end render plain: header + content, status: status end diff --git a/spec/internal/config/routes.rb b/spec/internal/config/routes.rb index ea33d34..c1869af 100644 --- a/spec/internal/config/routes.rb +++ b/spec/internal/config/routes.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + Rails.application.routes.draw do resources :replicants, only: :index root to: 'replicants#index' diff --git a/spec/lib/voight_kampff/rack_request_spec.rb b/spec/lib/voight_kampff/rack_request_spec.rb index 73174f4..0978cd5 100644 --- a/spec/lib/voight_kampff/rack_request_spec.rb +++ b/spec/lib/voight_kampff/rack_request_spec.rb @@ -1,10 +1,14 @@ +# frozen_string_literal: true + require 'spec_helper' describe Rack::Request do - let(:user_agent_string) { } - let(:env) { {'HTTP_USER_AGENT' => user_agent_string} } + let(:user_agent_string) {} + let(:env) { { 'HTTP_USER_AGENT' => user_agent_string } } subject { Rack::Request.new(env) } + require_relative '../../../lib/voight_kampff/rack_request' + it { expect(subject).to respond_to :human? } it { expect(subject).to respond_to :bot? } it { expect(subject).to respond_to :replicant? 
} diff --git a/spec/lib/voight_kampff/test_spec.rb b/spec/lib/voight_kampff/test_spec.rb index e1e9d63..88a03e9 100644 --- a/spec/lib/voight_kampff/test_spec.rb +++ b/spec/lib/voight_kampff/test_spec.rb @@ -1,6 +1,122 @@ +# frozen_string_literal: true + require 'spec_helper' describe VoightKampff::Test do + describe 'cache' do + describe 'creating' do + before do + FileUtils.rm_rf cache_path if cache_path + + allow(described_class).to receive(:cache_path).and_return cache_path + end + + shared_examples 'created after loading without cache' do + it do + expect(File.exist?(cache_path)).to be false + + described_class.load_crawlers + + expect(File.exist?(cache_path)).to be true + end + end + + context 'default cache_path' do + let(:cache_path) { described_class.cache_path } + + it_behaves_like 'created after loading without cache' + end + + context 'custom absolute cache_path' do + after do + FileUtils.rm_rf "#{Dir.pwd}/custom" + end + + let(:cache_path) { "#{Dir.pwd}/custom/directory/with/file.json" } + + it_behaves_like 'created after loading without cache' + end + + context 'custom relative cache_path' do + after do + FileUtils.rm_rf './custom' + end + + let(:cache_path) { './custom/directory/with/file.json' } + + it_behaves_like 'created after loading without cache' + end + + context 'without cache_path' do + let(:cache_path) { nil } + + before do + expect(File).not_to receive(:write) + end + + it { described_class.load_crawlers } + end + end + + describe 'updating' do + describe 'class instance variable' do + before do + described_class.crawlers + + described_class.instance_variable_set( + :@last_updated_at, fake_updated_at + ) + end + + context 'fresh' do + let(:fake_updated_at) { Time.now - described_class.max_age + 1 } + + it do + expect(described_class).not_to receive(:load_crawlers) + described_class.crawlers + end + end + + context 'stale' do + let(:fake_updated_at) { Time.now - described_class.max_age - 1 } + + it do + expect(described_class).to 
receive(:load_crawlers).and_call_original + described_class.crawlers + end + end + end + + describe 'file' do + before do + described_class.load_crawlers + + allow(File).to receive(:mtime).with(described_class.cache_path) + .and_return(fake_mtime) + end + + context 'fresh' do + let(:fake_mtime) { Time.now - described_class.max_age + 1 } + + it do + expect(described_class).not_to receive(:load_remote_crawlers) + described_class.load_crawlers + end + end + + context 'stale' do + let(:fake_mtime) { Time.now - described_class.max_age - 1 } + + it do + expect(described_class).to receive(:load_remote_crawlers) + .and_call_original + described_class.load_crawlers + end + end + end + end + end + let(:user_agent_string) { nil } subject { VoightKampff::Test.new(user_agent_string) } @@ -27,14 +143,18 @@ end context 'after the first run' do - before { VoightKampff::Test.new('anything').bot? } + def time_of_run + Benchmark.realtime { VoightKampff::Test.new('anything').bot? } + end - it 'is fast' do - expect( - Benchmark.realtime do - 20.times { VoightKampff::Test.new('anything').bot? 
} - end - ).to be < 0.003 + let(:number_of_runs) { 20 } + + times_faster = 2 + + it "is at least #{times_faster} times faster" do + expect(time_of_run / times_faster).to be > ( + (1..number_of_runs).map { time_of_run }.sum / number_of_runs + ) end end end diff --git a/spec/lib/voight_kampff_spec.rb b/spec/lib/voight_kampff_spec.rb index 0638348..721e862 100644 --- a/spec/lib/voight_kampff_spec.rb +++ b/spec/lib/voight_kampff_spec.rb @@ -1,9 +1,11 @@ +# frozen_string_literal: true + require 'spec_helper' describe VoightKampff do subject { VoightKampff } - HUMANS.each do |name, ua_string| + HUMANS.each do |_name, ua_string| context "when user agent is #{ua_string}" do let(:user_agent_string) { ua_string } @@ -14,7 +16,7 @@ end end - REPLICANTS.each do |name, ua_string| + REPLICANTS.each do |_name, ua_string| context "when user agent is #{ua_string}" do let(:user_agent_string) { ua_string } diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 9140cfa..3b97912 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,6 +1,9 @@ +# frozen_string_literal: true + require 'bundler/setup' +require 'pry-byebug' require 'combustion' -require 'voight_kampff' +require_relative '../lib/voight_kampff' Combustion.initialize! 
:action_controller @@ -8,4 +11,12 @@ Dir['./spec/support/**/*.rb'].each { |f| require f } RSpec.configure do |config| + config.before do + allow(Net::HTTP).to receive(:get).with(URI(VoightKampff::Test::DEFAULT_URL)) + .and_return File.read("#{__dir__}/fixtures/crawler-user-agents.json") + end + + config.after(:suite) do + FileUtils.rm_rf VoightKampff::Test.cache_path + end end diff --git a/spec/support/humans.rb b/spec/support/humans.rb index c2e5654..31d43bf 100644 --- a/spec/support/humans.rb +++ b/spec/support/humans.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + HUMANS = { 'Unknown' => nil, # for the moment we're treating a blank user agent string as not a bot 'Chrome' => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', @@ -6,4 +8,4 @@ 'Internet Explorer' => 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko', 'Chrome Mobile' => 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19', 'Safari for iOS' => 'Mozilla/5.0 (iPad; CPU OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53' -} +}.freeze diff --git a/spec/support/replicants.rb b/spec/support/replicants.rb index 3885d4d..1df2b74 100644 --- a/spec/support/replicants.rb +++ b/spec/support/replicants.rb @@ -1,5 +1,7 @@ +# frozen_string_literal: true + REPLICANTS = { 'Googlebot' => 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)', 'Bingbot' => 'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)', 'Yahoo! Slurp' => 'Mozilla/5.0 (compatible; Yahoo! 
Slurp; http://help.yahoo.com/help/us/ysearch/slurp)' -} +}.freeze diff --git a/voight_kampff.gemspec b/voight_kampff.gemspec index 342dc67..a89edc5 100644 --- a/voight_kampff.gemspec +++ b/voight_kampff.gemspec @@ -1,29 +1,30 @@ -# -*- encoding: utf-8 -*- -$:.unshift File.expand_path('../lib', __FILE__) -require 'voight_kampff/version' +# frozen_string_literal: true + +require_relative 'lib/voight_kampff/version' Gem::Specification.new do |s| s.name = 'voight_kampff' - s.summary = "Voight-Kampff bot detection" + s.summary = 'Voight-Kampff bot detection' s.description = 'Voight-Kampff detects bots, spiders, crawlers and replicants' - s.licenses = ['MIT'] + s.licenses = ['MIT'] - s.author = "Adam Crownoble" - s.email = "adam@codenoble.com" - s.homepage = "https://github.com/biola/Voight-Kampff" + s.author = 'Adam Crownoble' + s.email = 'adam@codenoble.com' + s.homepage = 'https://github.com/biola/Voight-Kampff' - # so that rubygems does not uses the actual object - s.version = VoightKampff::VERSION.dup - s.platform = Gem::Platform::RUBY.dup + s.version = VoightKampff::VERSION s.files = `git ls-files`.split("\n") s.files.reject! { |fn| fn.match(/\.travis.yml/) } - s.test_files = `git ls-files -- {tests}/**/*`.split("\n") + s.test_files = `git ls-files -- spec/**/*`.split("\n") s.require_path = 'lib' - s.add_dependency 'rack', ['>= 1.4', '< 3.0'] + s.required_ruby_version = '>= 2.4' s.add_development_dependency 'combustion', '~> 1.1' + s.add_development_dependency 'pry-byebug', '~> 3.7' + s.add_development_dependency 'rack', ['>= 1.4', '< 3.0'] s.add_development_dependency 'rails', '~> 5.2' s.add_development_dependency 'rspec-rails', '~> 3.8' + s.add_development_dependency 'rubocop', '~> 0.72.0' end