Skip to content

Commit

Permalink
Escape dots in patterns (#338)
Browse files Browse the repository at this point in the history
Co-authored-by: Martin Monperrus <[email protected]>
  • Loading branch information
giuscris and monperrus authored Apr 4, 2024
1 parent a9b2d45 commit 36ce640
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 24 deletions.
48 changes: 24 additions & 24 deletions crawler-user-agents.json
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@
}
,
{
"pattern": "grub.org",
"pattern": "grub\\.org",
"instances": [
"Mozilla/4.0 (compatible; grub-client-0.3.0; Crawl your own stuff with http://grub.org)",
"Mozilla/4.0 (compatible; grub-client-1.0.4; Crawl your own stuff with http://grub.org)",
Expand Down Expand Up @@ -855,7 +855,7 @@
}
,
{
"pattern": "Mail.RU_Bot",
"pattern": "Mail\\.RU_Bot",
"addition_date": "2011/04/27",
"instances": [
"Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)",
Expand Down Expand Up @@ -914,7 +914,7 @@
}
,
{
"pattern": "europarchive.org",
"pattern": "europarchive\\.org",
"addition_date": "2011/06/21",
"url": "",
"instances": [
Expand All @@ -923,7 +923,7 @@
}
,
{
"pattern": "NerdByNature.Bot",
"pattern": "NerdByNature\\.Bot",
"addition_date": "2011/07/12",
"url": "http://www.nerdbynature.net/bot",
"instances": [
Expand Down Expand Up @@ -1299,7 +1299,7 @@
}
,
{
"pattern": "web-archive-net.com.bot",
"pattern": "web-archive-net\\.com\\.bot",
"instances": []
}
,
Expand Down Expand Up @@ -1359,13 +1359,13 @@
}
,
{
"pattern": "ip-web-crawler.com",
"pattern": "ip-web-crawler\\.com",
"addition_date": "2013/03/22",
"instances": []
}
,
{
"pattern": "siteexplorer.info",
"pattern": "siteexplorer\\.info",
"addition_date": "2013/05/01",
"instances": [
"Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)",
Expand Down Expand Up @@ -1493,7 +1493,7 @@
}
,
{
"pattern": "g00g1e.net",
"pattern": "g00g1e\\.net",
"addition_date": "2014/04/01",
"url": "http://www.g00g1e.net/",
"instances": []
Expand Down Expand Up @@ -1584,7 +1584,7 @@
}
,
{
"pattern": "bnf.fr_bot",
"pattern": "bnf\\.fr_bot",
"addition_date": "2014/11/18",
"url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html",
"instances": [
Expand Down Expand Up @@ -1715,7 +1715,7 @@
}
,
{
"pattern": "archive.org_bot",
"pattern": "archive\\.org_bot",
"url": "http://www.archive.org/details/archive.org_bot",
"depends_on": ["heritrix"],
"instances": [
Expand Down Expand Up @@ -1895,7 +1895,7 @@
}
,
{
"pattern": "[email protected]",
"pattern": "collection@infegy\\.com",
"url": "http://infegy.com/",
"instances": [
"Mozilla/5.0 (compatible) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 [email protected]"
Expand Down Expand Up @@ -2179,7 +2179,7 @@
}
,
{
"pattern": "pinterest.com.bot",
"pattern": "pinterest\\.com\\/bot",
"addition_date": "2017/03/03",
"instances": [
"Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)",
Expand Down Expand Up @@ -2805,7 +2805,7 @@
}
,
{
"pattern": "Traackr.com",
"pattern": "Traackr\\.com",
"addition_date": "2017/11/02",
"url": "Traackr.com",
"instances": [
Expand Down Expand Up @@ -2941,7 +2941,7 @@
}
,
{
"pattern": "filterdb.iss.net\\/crawler",
"pattern": "filterdb\\.iss\\.net\\/crawler",
"addition_date": "2018/03/16",
"instances": [
"Mozilla/5.0 (compatible; oBot/2.3.1; +http://filterdb.iss.net/crawler/)"
Expand Down Expand Up @@ -3210,7 +3210,7 @@
}
,
{
"pattern": "Bot.AraTurka.com",
"pattern": "Bot\\.AraTurka\\.com",
"addition_date": "2018/06/27",
"instances": [
"Bot.AraTurka.com/0.0.1"
Expand All @@ -3219,7 +3219,7 @@
}
,
{
"pattern": "bot-pge.chlooe.com",
"pattern": "bot-pge\\.chlooe\\.com",
"addition_date": "2018/06/27",
"instances": [
"bot-pge.chlooe.com/1.0.0 (+http://www.chlooe.com/)"
Expand Down Expand Up @@ -3397,7 +3397,7 @@
}
,
{
"pattern": "Siteimprove.com",
"pattern": "Siteimprove\\.com",
"addition_date": "2018/06/22",
"instances": [
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) LinkCheck by Siteimprove.com",
Expand Down Expand Up @@ -3506,7 +3506,7 @@
}
,
{
"pattern": "PR-CY.RU",
"pattern": "PR-CY\\.RU",
"addition_date": "2018/08/30",
"instances": [
"Mozilla/5.0 (compatible; PR-CY.RU; + https://a.pr-cy.ru)"
Expand Down Expand Up @@ -3827,7 +3827,7 @@
]
},
{
"pattern": "Dataprovider.com",
"pattern": "Dataprovider\\.com",
"addition_date": "2018/11/24",
"instances": [
"Mozilla/5.0 (compatible; Dataprovider.com)"
Expand All @@ -3843,7 +3843,7 @@
"url": "http://www.grouphigh.com/"
},
{
"pattern": "theoldreader.com",
"pattern": "theoldreader\\.com",
"addition_date": "2018/12/02",
"instances": [
"Mozilla/5.0 (compatible; theoldreader.com)"
Expand Down Expand Up @@ -3879,7 +3879,7 @@
}
,
{
"pattern": "2ip.ru",
"pattern": "2ip\\.ru",
"addition_date": "2019/02/12",
"instances": [
"2ip.ru CMS Detector (https://2ip.ru/cms/)"
Expand Down Expand Up @@ -5000,7 +5000,7 @@
"url": "https://metrics-tools.de/robot.html"
},
{
"pattern": "hyscore.io",
"pattern": "hyscore\\.io",
"addition_date": "2023/09/08",
"instances": [
"Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1. 4 (compatible; HyScore/1.0; +https://hyscore.io/crawler/)"
Expand Down Expand Up @@ -5104,7 +5104,7 @@
"url": "https://torus.company/bot.html"
},
{
"pattern": "sempi.tech",
"pattern": "sempi\\.tech",
"addition_date": "2023/09/08",
"instances": [
"Mozilla/5.0 (compatible; Semanticbot/1.0; +http://sempi.tech/bot.html)"
Expand Down Expand Up @@ -5160,7 +5160,7 @@
"url": "https://opengraphcheck.com"
},
{
"pattern": "developers.google.com\\/\\+\\/web\\/snippet",
"pattern": "developers\\.google\\.com\\/\\+\\/web\\/snippet",
"addition_date": "2023/09/08",
"instances": [
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)",
Expand Down
6 changes: 6 additions & 0 deletions validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ def main():
if re.search('[^\\\\]/', pattern):
raise ValueError('Pattern {!r} has an unescaped slash character'.format(pattern))

# check that no pattern contains unescaped dot .
for entry in json_data:
pattern = entry['pattern']
if re.search('[^\\\\]\\.', pattern):
raise ValueError('Pattern {!r} has an unescaped dot character'.format(pattern))

# check that we match the given instances
num_instances = 0
for entry in json_data:
Expand Down

0 comments on commit 36ce640

Please sign in to comment.