Skip to content

Commit

Permalink
fix: improve Python harness (#368)
Browse files Browse the repository at this point in the history
  • Loading branch information
jribbens authored Sep 6, 2024
1 parent 4683176 commit e7f72bc
Show file tree
Hide file tree
Showing 7 changed files with 93 additions and 21 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ jobs:
with:
node-version: 20
- run: node format.js --check
- run: pip3 install -r requirements.txt
- run: python3 -mpip install -U pip
- run: python3 -mpip install -e .[dev]
- run: py.test -vv
- run: python3 validate.py
- run: php validate.php
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
.cache
__pycache__
/vendor/
/env/
*.egg-info/
18 changes: 16 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,24 @@ Then:

```python
import crawleruseragents
if crawleruseragents.is_crawler("googlebot/"):
if crawleruseragents.is_crawler("Googlebot/"):
# do something
```

or:

```python
import crawleruseragents
indices = crawleruseragents.matching_crawlers("bingbot/2.0")
print("crawlers' indices:", indices)
print(
"crawler's URL:",
crawleruseragents.CRAWLER_USER_AGENTS_DATA[indices[0]]["url"]
)
```

Note that `matching_crawlers` is much slower than `is_crawler` when the given User-Agent does match one or more crawlers, because it must then test every pattern individually.

### Go

Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
Expand All @@ -70,7 +84,7 @@ func main() {

indices := agents.MatchingCrawlers(userAgent)
fmt.Println("crawlers' indices:", indices)
fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
}
```

Expand Down
27 changes: 18 additions & 9 deletions __init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,24 @@ def load_json():


# Crawler database loaded once at import time from the bundled JSON file.
CRAWLER_USER_AGENTS_DATA = load_json()
# Single combined alternation regex over every crawler pattern, compiled
# once at import time (case-sensitive: no flags passed to re.compile).
CRAWLER_USER_AGENTS_REGEXP = re.compile(
    "|".join(i["pattern"] for i in CRAWLER_USER_AGENTS_DATA)
)


def is_crawler(user_agent: str) -> bool:
    """Return True if the given User-Agent matches any known crawler pattern.

    Matching is case-sensitive, consistent with matching_crawlers() and the
    harness tests (test_case expects lower-case "googlebot" NOT to match).
    The original re.IGNORECASE flag made this function disagree with both.
    """
    # any() short-circuits on the first matching pattern, like the old loop.
    return any(
        re.search(entry["pattern"], user_agent)
        for entry in CRAWLER_USER_AGENTS_DATA
    )


def is_crawler2(s):
    """Return True if *s* matches any crawler pattern via one combined regex.

    Builds the alternation regex on every call; case-sensitive (no flags).
    """
    # Join every pattern into a single "p1|p2|..." alternation.
    regexp = re.compile("|".join([i["pattern"] for i in CRAWLER_USER_AGENTS_DATA]))
    return regexp.search(s) is not None
    # NOTE(review): the two lines below are unreachable (they follow the
    # return above) and appear to be diff residue from another function's
    # body — confirm against the repository and remove.
    """Return True if the given User-Agent matches a known crawler."""
    return bool(CRAWLER_USER_AGENTS_REGEXP.search(user_agent))


def matching_crawlers(user_agent: str) -> list[int]:
    """
    Return a list of the indices in CRAWLER_USER_AGENTS_DATA of any crawlers
    matching the given User-Agent.
    """
    # Fast path: the combined check rejects non-crawlers without scanning
    # every individual pattern.
    if not is_crawler(user_agent):
        return []
    # Collect the index of every individual pattern that matches.
    return [
        idx
        for idx, entry in enumerate(CRAWLER_USER_AGENTS_DATA)
        if re.search(entry["pattern"], user_agent)
    ]
16 changes: 16 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,22 @@ authors = [

readme = "README.md"

[project.optional-dependencies]
dev = [
"attrs==23.2.0",
"iniconfig==2.0.0",
"jsonschema==4.22.0",
"jsonschema-specifications==2023.12.1",
"packaging==24.0",
"pluggy==1.5.0",
"pytest==8.2.0",
"referencing==0.35.0",
"rpds-py==0.18.0",
]

[project.urls]
Homepage = "https://github.com/monperrus/crawler-user-agents"

[tool.setuptools]
package-dir = {"crawleruseragents" = "."}

Expand Down
9 changes: 0 additions & 9 deletions requirements.txt

This file was deleted.

39 changes: 39 additions & 0 deletions test_harness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""
Simple tests for python harness
Usage:
$ pytest test_harness.py
"""
from crawleruseragents import is_crawler, matching_crawlers


def test_match():
    """A User-Agent containing Googlebot must be detected as a crawler."""
    verdict = is_crawler("test Googlebot/2.0 test")
    assert verdict is True


def test_nomatch():
    """A junk User-Agent must not match any crawler."""
    verdict = is_crawler("!!!!!!!!!!!!")
    assert verdict is False


def test_case():
    """Matching is case-sensitive: lower-case googlebot is not detected."""
    verdict = is_crawler("test googlebot/2.0 test")
    assert verdict is False


def test_matching_crawlers_match():
    """A matching UA yields a non-empty list of integer indices."""
    indices = matching_crawlers("test Googlebot/2.0 test")
    assert isinstance(indices, list)
    assert len(indices) > 0
    for value in indices:
        assert isinstance(value, int)


def test_matching_crawlers_nomatch():
    """A junk UA yields an empty list of indices."""
    indices = matching_crawlers("!!!!!!!!!!!!")
    assert isinstance(indices, list)
    assert len(indices) == 0


def test_matching_crawlers_case():
    """Index matching is case-sensitive: lower-case UA yields no indices."""
    indices = matching_crawlers("test googlebot/2.0 test")
    assert isinstance(indices, list)
    assert len(indices) == 0

0 comments on commit e7f72bc

Please sign in to comment.