From 36ce640176b9e8254ce0e3d22d4f0e5729372156 Mon Sep 17 00:00:00 2001 From: Giuseppe Criscione <18699708+giuscris@users.noreply.github.com> Date: Thu, 4 Apr 2024 17:22:45 +0200 Subject: [PATCH] Escape dots in patterns (#338) Co-authored-by: Martin Monperrus --- crawler-user-agents.json | 48 ++++++++++++++++++++-------------------- validate.py | 6 +++++ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/crawler-user-agents.json b/crawler-user-agents.json index 5d2110c..f4ca930 100644 --- a/crawler-user-agents.json +++ b/crawler-user-agents.json @@ -405,7 +405,7 @@ } , { - "pattern": "grub.org", + "pattern": "grub\\.org", "instances": [ "Mozilla/4.0 (compatible; grub-client-0.3.0; Crawl your own stuff with http://grub.org)", "Mozilla/4.0 (compatible; grub-client-1.0.4; Crawl your own stuff with http://grub.org)", @@ -855,7 +855,7 @@ } , { - "pattern": "Mail.RU_Bot", + "pattern": "Mail\\.RU_Bot", "addition_date": "2011/04/27", "instances": [ "Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)", @@ -914,7 +914,7 @@ } , { - "pattern": "europarchive.org", + "pattern": "europarchive\\.org", "addition_date": "2011/06/21", "url": "", "instances": [ @@ -923,7 +923,7 @@ } , { - "pattern": "NerdByNature.Bot", + "pattern": "NerdByNature\\.Bot", "addition_date": "2011/07/12", "url": "http://www.nerdbynature.net/bot", "instances": [ @@ -1299,7 +1299,7 @@ } , { - "pattern": "web-archive-net.com.bot", + "pattern": "web-archive-net\\.com\\.bot", "instances": [] } , @@ -1359,13 +1359,13 @@ } , { - "pattern": "ip-web-crawler.com", + "pattern": "ip-web-crawler\\.com", "addition_date": "2013/03/22", "instances": [] } , { - "pattern": "siteexplorer.info", + "pattern": "siteexplorer\\.info", "addition_date": "2013/05/01", "instances": [ "Mozilla/5.0 (compatible; SiteExplorer/1.0b; +http://siteexplorer.info/)", @@ -1493,7 +1493,7 @@ } , { - "pattern": "g00g1e.net", + "pattern": "g00g1e\\.net", "addition_date": "2014/04/01", "url": "http://www.g00g1e.net/", "instances": [] @@ -1584,7 +1584,7 @@ } , { - "pattern": "bnf.fr_bot", + "pattern": "bnf\\.fr_bot", "addition_date": "2014/11/18", "url": "http://www.bnf.fr/fr/outils/a.dl_web_capture_robot.html", "instances": [ @@ -1715,7 +1715,7 @@ } , { - "pattern": "archive.org_bot", + "pattern": "archive\\.org_bot", "url": "http://www.archive.org/details/archive.org_bot", "depends_on": ["heritrix"], "instances": [ @@ -1895,7 +1895,7 @@ } , { - "pattern": "collection@infegy.com", + "pattern": "collection@infegy\\.com", "url": "http://infegy.com/", "instances": [ "Mozilla/5.0 (compatible) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36 collection@infegy.com" @@ -2179,7 +2179,7 @@ } , { - "pattern": "pinterest.com.bot", + "pattern": "pinterest\\.com\\/bot", "addition_date": "2017/03/03", "instances": [ "Mozilla/5.0 (compatible; Pinterestbot/1.0; +http://www.pinterest.com/bot.html)", @@ -2805,7 +2805,7 @@ } , { - "pattern": "Traackr.com", + "pattern": "Traackr\\.com", "addition_date": "2017/11/02", "url": "Traackr.com", "instances": [ @@ -2941,7 +2941,7 @@ } , { - "pattern": "filterdb.iss.net\\/crawler", + "pattern": "filterdb\\.iss\\.net\\/crawler", "addition_date": "2018/03/16", "instances": [ "Mozilla/5.0 (compatible; oBot/2.3.1; +http://filterdb.iss.net/crawler/)" @@ -3210,7 +3210,7 @@ } , { - "pattern": "Bot.AraTurka.com", + "pattern": "Bot\\.AraTurka\\.com", "addition_date": "2018/06/27", "instances": [ "Bot.AraTurka.com/0.0.1" @@ -3219,7 +3219,7 @@ } , { - "pattern": "bot-pge.chlooe.com", + "pattern": "bot-pge\\.chlooe\\.com", "addition_date": "2018/06/27", "instances": [ "bot-pge.chlooe.com/1.0.0 (+http://www.chlooe.com/)" @@ -3397,7 +3397,7 @@ } , { - "pattern": "Siteimprove.com", + "pattern": "Siteimprove\\.com", "addition_date": "2018/06/22", "instances": [ "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) LinkCheck by Siteimprove.com", @@ -3506,7 +3506,7 @@ } , { - "pattern": "PR-CY.RU", + "pattern": "PR-CY\\.RU", "addition_date": "2018/08/30", "instances": [ "Mozilla/5.0 (compatible; PR-CY.RU; + https://a.pr-cy.ru)" @@ -3827,7 +3827,7 @@ ] }, { - "pattern": "Dataprovider.com", + "pattern": "Dataprovider\\.com", "addition_date": "2018/11/24", "instances": [ "Mozilla/5.0 (compatible; Dataprovider.com)" @@ -3843,7 +3843,7 @@ "url": "http://www.grouphigh.com/" }, { - "pattern": "theoldreader.com", + "pattern": "theoldreader\\.com", "addition_date": "2018/12/02", "instances": [ "Mozilla/5.0 (compatible; theoldreader.com)" @@ -3879,7 +3879,7 @@ } , { - "pattern": "2ip.ru", + "pattern": "2ip\\.ru", "addition_date": "2019/02/12", "instances": [ "2ip.ru CMS Detector (https://2ip.ru/cms/)" @@ -5000,7 +5000,7 @@ "url": "https://metrics-tools.de/robot.html" }, { - "pattern": "hyscore.io", + "pattern": "hyscore\\.io", "addition_date": "2023/09/08", "instances": [ "Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12F70 Safari/600.1. 4 (compatible; HyScore/1.0; +https://hyscore.io/crawler/)" @@ -5104,7 +5104,7 @@ "url": "https://torus.company/bot.html" }, { - "pattern": "sempi.tech", + "pattern": "sempi\\.tech", "addition_date": "2023/09/08", "instances": [ "Mozilla/5.0 (compatible; Semanticbot/1.0; +http://sempi.tech/bot.html)" @@ -5160,7 +5160,7 @@ "url": "https://opengraphcheck.com" }, { - "pattern": "developers.google.com\\/\\+\\/web\\/snippet", + "pattern": "developers\\.google\\.com\\/\\+\\/web\\/snippet", "addition_date": "2023/09/08", "instances": [ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)", diff --git a/validate.py b/validate.py index 94f7252..8f9c134 100644 --- a/validate.py +++ b/validate.py @@ -55,6 +55,12 @@ def main(): if re.search('[^\\\\]/', pattern): raise ValueError('Pattern {!r} has an unescaped slash character'.format(pattern)) + # check that no pattern contains unescaped dot . + for entry in json_data: + pattern = entry['pattern'] + if re.search('[^\\\\]\\.', pattern): + raise ValueError('Pattern {!r} has an unescaped dot character'.format(pattern)) + # check that we match the given instances num_instances = 0 for entry in json_data: