Skip to content

Commit

Permalink
patterns json: add support for GroupMe bot (#359)
Browse files Browse the repository at this point in the history
  • Loading branch information
bentsi committed May 19, 2024
1 parent 923130b commit 708b496
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 17 deletions.
33 changes: 18 additions & 15 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
import crawleruseragents
import re
import json
from importlib import resources

from pathlib import Path


def load_json():
    """Load the crawler definitions stored next to this module.

    Reads ``crawler-user-agents.json`` from the directory that contains this
    file and returns the decoded JSON document.

    Raises:
        FileNotFoundError: if the JSON file is not present beside the module.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    cwd = Path(__file__).parent
    user_agents_file_path = cwd / "crawler-user-agents.json"
    # Open with an explicit encoding so decoding does not depend on the
    # platform's default locale.
    with user_agents_file_path.open(encoding="utf-8") as patterns_file:
        return json.load(patterns_file)

# Crawler definitions, loaded once when the module is imported.
DATA = load_json()

def is_crawler(s):
    """Return True if ``s`` matches any crawler pattern in ``DATA``.

    Each entry's ``"pattern"`` is used as a regular expression and searched
    for anywhere in ``s``, ignoring case. Evaluation stops at the first
    pattern that matches.
    """
    return any(
        re.search(entry["pattern"], s, re.IGNORECASE) for entry in DATA
    )
# Crawler definitions, loaded once when the module is imported.
CRAWLER_USER_AGENTS_DATA = load_json()

def is_crawler2(s):
    """Return True if ``s`` matches the combined crawler pattern.

    All ``"pattern"`` values in ``DATA`` are joined with ``|`` into a single
    regular expression, which is then searched for anywhere in ``s``.
    """
    # NOTE: no flags are passed here, so this search is case-sensitive.
    regexp = re.compile("|".join(entry["pattern"] for entry in DATA))
    return regexp.search(s) is not None

def is_crawler(user_agent: str) -> bool:
    """Return True if ``user_agent`` matches any known crawler pattern.

    Each entry's ``"pattern"`` in ``CRAWLER_USER_AGENTS_DATA`` is used as a
    regular expression and searched for anywhere in ``user_agent``, ignoring
    case. Evaluation stops at the first pattern that matches.
    """
    return any(
        re.search(crawler_user_agent["pattern"], user_agent, re.IGNORECASE)
        for crawler_user_agent in CRAWLER_USER_AGENTS_DATA
    )


def is_crawler2(s: str) -> bool:
    """Return True if ``s`` matches the combined crawler pattern.

    All ``"pattern"`` values in ``CRAWLER_USER_AGENTS_DATA`` are joined with
    ``|`` into a single regular expression, which is then searched for
    anywhere in ``s``.
    """
    # NOTE: no flags are passed here, so this search is case-sensitive.
    regexp = re.compile(
        "|".join(entry["pattern"] for entry in CRAWLER_USER_AGENTS_DATA)
    )
    return regexp.search(s) is not None
9 changes: 7 additions & 2 deletions crawler-user-agents.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.96 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
"Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36"
]
}
,
},
{
"pattern": "Googlebot-Mobile",
"instances": [
Expand Down Expand Up @@ -5314,5 +5313,11 @@
"addition_date": "2024/05/14",
"instances": ["Mozilla/5.0 (compatible; Monsidobot/2.2; +http://monsido.com/bot.html; info@monsido.com)"],
"url": "http://monsido.com/bot.html"
},
{
"pattern": "GroupMeBot",
"addition_date": "2024/05/19",
"instances": ["GroupMeBot/1.0"],
"url": "https://groupme.com/"
}
]

0 comments on commit 708b496

Please sign in to comment.