From 923130b39db807eef1d3644c6f5f43f3582896c5 Mon Sep 17 00:00:00 2001
From: Martin Monperrus
Date: Sat, 18 May 2024 11:58:11 +0200
Subject: [PATCH] add usage harness in Python

---
 README.md   | 23 ++++++++++++++++-------
 __init__.py | 22 ++++++++++++++++++++++
 2 files changed, 38 insertions(+), 7 deletions(-)
 create mode 100644 __init__.py

diff --git a/README.md b/README.md
index db421e1..248742e 100644
--- a/README.md
+++ b/README.md
@@ -6,13 +6,15 @@
 This repository contains a list of HTTP user-agents used by robots, crawlers,
 
 * Go package: 
 * PyPi package: 
 
+Each `pattern` is a regular expression. It should work out-of-the-box with your favorite regex library.
+
 ## Install
 
 ### Direct download
 
 Download the [`crawler-user-agents.json` file](https://raw.githubusercontent.com/monperrus/crawler-user-agents/master/crawler-user-agents.json) from this repository directly.
 
-### Npm / Yarn
+### JavaScript
 
 crawler-user-agents is deployed on npmjs.com: 
 
@@ -31,14 +33,21 @@
 const crawlers = require('crawler-user-agents');
 console.log(crawlers);
 ```
 
-## Usage
+### Python
 
-Each `pattern` is a regular expression. It should work out-of-the-box wih your favorite regex library:
+Install with `pip install crawler-user-agents`.
+
+Then:
+
+```python
+import crawleruseragents
+if crawleruseragents.is_crawler("googlebot/"):
+    ...  # do something
+```
+
+### Go
 
-* JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent']) { ... }`
-* PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...`
-* Python: `if re.search(entry['pattern'], ua): ...`
-* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
+Use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
 it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`),
 functions `IsCrawler` and `MatchingCrawlers`.

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..6ffe061
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,22 @@
+import json
+import re
+from importlib import resources
+
+def load_json():
+    # load the bundled crawler-user-agents.json from within this package
+    return json.loads(resources.read_text(__package__, "crawler-user-agents.json"))
+
+DATA = load_json()
+
+def is_crawler(user_agent):
+    # test the user-agent string against each pattern, one at a time
+    for entry in DATA:
+        if re.search(entry["pattern"], user_agent, re.IGNORECASE):
+            return True
+    return False
+
+def is_crawler2(user_agent):
+    # same check, with all patterns combined into a single regex
+    regexp = re.compile("|".join(entry["pattern"] for entry in DATA), re.IGNORECASE)
+    return regexp.search(user_agent) is not None
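
As a quick check of the new harness, a minimal smoke-test sketch (assuming the package is installed via `pip install crawler-user-agents` and importable as `crawleruseragents`, as in the README section above; the expected values assume `Googlebot` appears in the pattern list):

```python
import crawleruseragents

# is_crawler tests each pattern separately; is_crawler2 folds all
# patterns into one combined regex. Both should agree on any input.
ua = "Googlebot/2.1 (+http://www.google.com/bot.html)"
print(crawleruseragents.is_crawler(ua))   # True: "Googlebot" is in the list
print(crawleruseragents.is_crawler2(ua))  # True: same result via the combined regex

# a plain browser user-agent should not match any crawler pattern
print(crawleruseragents.is_crawler("Mozilla/5.0 (Windows NT 10.0; Win64; x64)"))  # expected: False
```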