Skip to content

Commit

Permalink
Import fixture files from upstream
Browse files Browse the repository at this point in the history
  • Loading branch information
mngsk committed Feb 1, 2021
1 parent 0d91ac5 commit bc60d53
Show file tree
Hide file tree
Showing 54 changed files with 42,619 additions and 1,597 deletions.
14 changes: 7 additions & 7 deletions README.md

Large diffs are not rendered by default.

153 changes: 124 additions & 29 deletions src/main/resources/regexes/bots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
- regex: '360Spider(-Image|-Video)?'
name: '360Spider'
category: 'Search bot'
url: 'http://www.so.com/help/help_3_2.html'
url: 'https://www.so.com/help/help_3_2.html'
producer:
name: 'Online Media Group, Inc.'
url: ''
Expand Down Expand Up @@ -40,26 +40,26 @@
- regex: 'AhrefsBot'
name: 'aHrefs Bot'
category: 'Crawler'
url: 'http://ahrefs.com/robot'
url: 'https://ahrefs.com/robot'
producer:
name: 'Ahrefs Pte Ltd'
url: 'http://ahrefs.com/robot'
url: 'https://ahrefs.com/robot'

- regex: 'ia_archiver|alexabot|verifybot'
name: 'Alexa Crawler'
category: 'Search bot'
url: 'https://alexa.zendesk.com/hc/en-us/sections/200100794-Crawlers'
url: 'https://support.alexa.com/hc/en-us/sections/200100794-Crawlers'
producer:
name: 'Alexa Internet'
url: 'http://www.alexa.com'
url: 'https://www.alexa.com'

- regex: 'alexa site audit'
name: 'Alexa Site Audit'
category: 'Site Monitor'
url: 'http://www.alexa.com/help/webmasters'
url: 'https://support.alexa.com/hc/en-us/articles/200450194'
producer:
name: 'Alexa Internet'
url: 'http://www.alexa.com'
url: 'https://www.alexa.com'

- regex: 'Amazon[ -]Route ?53[ -]Health[ -]Check[ -]Service'
name: 'Amazon Route53 Health Check'
Expand All @@ -82,23 +82,23 @@
url: 'https://httpd.apache.org/docs/2.4/programs/ab.html'
producer:
name: 'The Apache Software Foundation'
url: 'http://www.apache.org/foundation/'
url: 'https://www.apache.org/foundation/'

- regex: 'Applebot'
name: 'Applebot'
category: 'Crawler'
url: 'http://www.apple.com/go/applebot'
url: 'https://support.apple.com/en-us/HT204683'
producer:
name: 'Apple Inc'
url: 'http://www.apple.com'
url: 'https://www.apple.com'

- regex: 'Arachni'
name: 'Arachni'
category: 'Security Checker'
url: 'http://www.arachni-scanner.com'
url: 'https://www.arachni-scanner.com/'
producer:
name: 'Sarosys LLC'
url: 'http://www.sarosys.com/'
url: 'https://www.sarosys.com/'

- regex: 'AspiegelBot'
name: 'AspiegelBot'
Expand All @@ -112,7 +112,7 @@
name: 'Castro 2'
category: 'Service Agent'
url: 'http://supertop.co/castro/'
producer:
producer:
name: 'Supertop'
url: 'http://supertop.co'

Expand All @@ -127,10 +127,10 @@
- regex: 'archive\.org_bot|special_archiver'
name: 'archive.org bot'
category: 'Crawler'
url: 'http://www.archive.org/details/archive.org_bot'
url: 'https://archive.org/details/archive.org_bot'
producer:
name: 'The Internet Archive'
url: 'http://www.archive.org'
url: 'https://archive.org'

- regex: 'Ask Jeeves/Teoma'
name: 'Ask Jeeves'
Expand Down Expand Up @@ -217,7 +217,7 @@
category: 'Crawler'
producer:
name: 'BoardReader'
url: 'http://boardreader.com/'
url: 'https://boardreader.com/'

- regex: 'BountiiBot'
name: 'Bountii Bot'
Expand Down Expand Up @@ -283,6 +283,14 @@
name: 'CloudFlare'
url: 'http://www.cloudflare.com'

- regex: 'CloudflareDiagnostics'
name: 'Cloudflare Diagnostics'
category: 'Site Monitor'
url: 'https://www.cloudflare.com/'
producer:
name: 'Cloudflare'
url: 'https://www.cloudflare.com'

- regex: 'CloudFlare-AlwaysOnline'
name: 'CloudFlare Always Online'
category: 'Site Monitor'
Expand Down Expand Up @@ -339,7 +347,6 @@
name: 'Datanyze'
url: 'https://www.datanyze.com'


- regex: 'Dataprovider'
name: 'Dataprovider'
category: 'Crawler'
Expand Down Expand Up @@ -649,7 +656,7 @@
name: 'Visual Meta'
url: 'https://www.shopalike.cz/'

- regex: 'AdsBot-Google(-Mobile)?|Adwords-(DisplayAds|Express|Instant)|Google Web Preview|Google[ -]Publisher[ -]Plugin|Google-(Adwords|AMPHTML|Assess|HotelAdsVerifier|Read-Aloud|Shopping-Quality|Site-Verification|speakr|Test|Youtube-Links)|(APIs|DuplexWeb|Feedfetcher|Mediapartners)-Google|Googlebot(-Mobile|-Image|-Video|-News)?|GoogleProducer|Google.*/\+/web/snippet'
- regex: 'AdsBot-Google(-Mobile)?|Adwords-(DisplayAds|Express|Instant)|Google Web Preview|Google[ -]Publisher[ -]Plugin|Google-(Ads-Qualify|Adwords|AMPHTML|Assess|HotelAdsVerifier|Read-Aloud|Shopping-Quality|Site-Verification|speakr|Test|Youtube-Links)|(APIs|DuplexWeb|Feedfetcher|Mediapartners)-Google|Googlebot(-Mobile|-Image|-Video|-News)?|GoogleProducer|Google.*/\+/web/snippet'
name: 'Googlebot'
category: 'Search bot'
url: 'http://www.google.com/bot.html'
Expand All @@ -663,7 +670,7 @@
url: 'https://webarchive.jira.com/wiki/display/Heritrix/Heritrix'
producer:
name: 'The Internet Archive'
url: 'http://www.archive.org'
url: 'https://archive.org'

- regex: 'HubSpot '
name: 'HubSpot'
Expand All @@ -672,7 +679,6 @@
name: 'HubSpot Inc.'
url: 'https://www.hubspot.com'


- regex: 'HTTPMon'
name: 'HTTPMon'
category: 'Site Monitor'
Expand Down Expand Up @@ -788,7 +794,7 @@
name: ''
url: ''

- regex : 'masscan'
- regex: 'masscan'
name: 'masscan'
url: 'https://github.com/robertdavidgraham/masscan'
category: 'Crawler'
Expand Down Expand Up @@ -941,7 +947,7 @@
category: 'Crawler'
producer:
name: 'Nuzzel'
url: https://www.nuzzel.com/
url: 'https://www.nuzzel.com/'

- regex: 'Octopus [0-9]'
name: 'Octopus'
Expand Down Expand Up @@ -1039,7 +1045,7 @@
name: 'Picsearch'
url: 'http://www.picsearch.com'

- regex: 'Pingdom\.com'
- regex: 'Pingdom(?:\.com|TMS)'
name: 'Pingdom Bot'
category: 'Site Monitor'
url: ''
Expand Down Expand Up @@ -1971,10 +1977,10 @@
- regex: 'BoardReader Favicon Fetcher'
name: 'BoardReader'
category: 'Search bot'
url: 'http://boardreader.com/'
url: 'https://boardreader.com/'
producer:
name: 'Effyis Inc'
url: 'http://boardreader.com/'
url: 'https://boardreader.com/'

- regex: 'IDG/IT'
name: 'IDG/IT'
Expand Down Expand Up @@ -2019,7 +2025,7 @@
- regex: 'oBot'
name: 'oBot'
category: 'Search bot'
url: 'http://www.xforce-security.com/crawler/'
url: 'https://www.xforce-security.com/crawler/'
producer:
name: 'IBM Germany Research & Development GmbH'
url: 'https://exchange.xforce.ibmcloud.com/'
Expand Down Expand Up @@ -2062,7 +2068,7 @@
url: 'https://nutch.apache.org'
producer:
name: 'The Apache Software Foundation'
url: 'http://www.apache.org/foundation/'
url: 'https://www.apache.org/foundation/'

- regex: 'Seobility'
name: 'Seobility'
Expand All @@ -2077,7 +2083,7 @@
- regex: 'Grammarly'
name: 'Grammarly'
category: 'Service bot'
url: 'http://www.grammarly.com'
url: 'https://www.grammarly.com'

- regex: 'Robozilla'
name: 'Robozilla'
Expand All @@ -2096,7 +2102,7 @@
- regex: 'SerendeputyBot'
name: 'Serendeputy Bot'
category: 'Crawler'
url: 'http://serendeputy.com/about/serendeputy-bot'
url: 'https://serendeputy.com/about/serendeputy-bot'

- regex: 'ias-va.*admantx.*service-fetcher'
name: 'ADmantX Service Fetcher'
Expand All @@ -2118,6 +2124,95 @@
category: 'Crawler'
url: 'http://www.exensa.com/crawl'

- regex: 'BDCbot'
name: 'BDCbot'
category: 'Crawler'
url: 'https://bigweb.bigdatacorp.com.br/pages/faq.aspx'
producer:
name: 'BIG Data Solucoes Em Tecnologia de Informatica LTDA'
url: 'https://bigdatacorp.com.br/'

- regex: 'adbeat'
name: 'Adbeat'
category: 'Crawler'
url: 'https://www.adbeat.com/operation_policy'
producer:
name: 'PPC Labs LLC'
url: 'https://www.adbeat.com/'

- regex: 'BW/(?:(\d+[\.\d]+))'
name: 'BuiltWith'
category: 'Crawler'
url: 'https://builtwith.com/biup'
producer:
name: 'BuiltWith Pty Ltd'
url: 'https://builtwith.com/'

- regex: 'https://whatis.contentkingapp.com'
name: 'ContentKing'
category: 'Site Monitor'
url: 'https://whatis.contentkingapp.com/'
producer:
name: 'ContentKing BV'
url: 'https://www.contentkingapp.com/'

- regex: 'MicroAdBot'
name: 'MicroAdBot'
category: 'Crawler'
url: 'https://www.microad.co.jp/'
producer:
name: 'MicroAd, Inc.'
url: 'https://www.microad.co.jp/'

- regex: 'PingAdmin.Ru'
name: 'PingAdmin.Ru'
category: 'Site Monitor'
url: 'https://ping-admin.ru/'

- regex: 'notifyninja.+monitoring'
name: 'Notify Ninja'
category: 'Site Monitor'
url: 'http://notifyninja.com'

- regex: 'WebDataStats'
name: 'WebDataStats'
category: 'Crawler'
url: 'https://webdatastats.com/policy.html'
producer:
name: 'WebTehRazrabotka LLC'
url: 'https://webdatastats.com/'

- regex: 'parse.ly scraper'
name: 'parse.ly'
category: 'Crawler'
url: 'https://www.parse.ly/help/integration/crawler'
producer:
name: 'Parsely, Inc.'
url: 'https://www.parse.ly/'

- regex: 'Nimbostratus-Bot'
name: 'Nimbostratus Bot'
category: 'Site Monitor'
url: 'http://cloudsystemnetworks.com'

- regex: 'HeartRails_Capture/\d'
name: 'Heart Rails Capture'
category: 'Service Agent'
url: 'http://capture.heartrails.com'

- regex: 'Project-Resonance'
name: 'Project Resonance'
category: 'Crawler'
url: 'http://project-resonance.com'

- regex: 'DataXu/\d'
name: 'DataXu'
category: 'Service Agent'
url: 'https://advertising.roku.com/dataxu'
producer:
name: 'Roku, Inc.'
url: 'https://roku.com'

# Generic detections

- regex: '[a-z0-9\-_]*((?<!cu|power[ _]|m[ _])bot(?![ _]TAB|[ _]?5[0-9])|crawler|crawl|checker|archiver|transcoder|spider)([^a-z]|$)'
Expand Down
8 changes: 7 additions & 1 deletion src/main/resources/regexes/client/browser_engine.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
- regex: 'Trident'
name: 'Trident'

- regex: 'Blink'
- regex: '(?<!SmartHu)Blink'
name: 'Blink'

- regex: '(?:Apple)?WebKit'
Expand All @@ -23,6 +23,9 @@
- regex: 'Presto'
name: 'Presto'

- regex: 'Goanna'
name: 'Goanna'

- regex: '(?<!like )Gecko'
name: 'Gecko'

Expand All @@ -37,3 +40,6 @@

- regex: 'Goanna'
name: 'Goanna'

- regex: 'Ekioh(?:Flow)?'
name: 'EkiohFlow'
20 changes: 20 additions & 0 deletions src/main/resources/regexes/client/browser_family.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Chrome:
- AVG Secure Browser
- Beaker Browser
- Beamrise
- Borealis Navigator
- Brave
- CCleaner
- Coc Coc
Expand Down Expand Up @@ -87,6 +88,21 @@ Chrome:
- Colibri
- Quark
- Yaani Browser
- Japan Browser
- Monument Browser
- Slimjet
- 7Star
- MxNitro
- Phoenix Browser
- UR Browser
- NFS Browser
- Stargon
- Seewo Browser
- Chim Lac
- Seraphic Sraf
- Odin
- SFive
- T-Browser

Firefox:
- Basilisk
Expand Down Expand Up @@ -119,11 +135,15 @@ Firefox:
- Safe Exam Browser
- Zvu
- Iceweasel
- PrivacyWall
- Borealis Navigator

Internet Explorer:
- Internet Explorer
- IE Mobile
- Microsoft Edge
- Crazy Browser
-

Konqueror:
- Konqueror
Expand Down
Loading

0 comments on commit bc60d53

Please sign in to comment.