Add Golang package #348

Merged 23 commits on Apr 5, 2024
1 change: 1 addition & 0 deletions .github/workflows/ci-validation.yml
@@ -23,3 +23,4 @@ jobs:
- run: py.test -vv
- run: python3 validate.py
- run: php validate.php
- run: go test
35 changes: 34 additions & 1 deletion README.md
@@ -34,6 +34,40 @@ Each `pattern` is a regular expression. It should work out-of-the-box with your favorite regex library:
* JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent'])) { ... }`
* PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...`
* Python: `if re.search(entry['pattern'], ua): ...`
* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents),
  which provides the global variable `Crawlers` (kept in sync with `crawler-user-agents.json`)
  and the functions `IsCrawler` and `MatchingCrawlers`.

Example Go program:

```go
package main

import (
"fmt"

"github.com/monperrus/crawler-user-agents"
)

func main() {
userAgent := "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)"

isCrawler := agents.IsCrawler(userAgent)
fmt.Println("isCrawler:", isCrawler)

indices := agents.MatchingCrawlers(userAgent)
fmt.Println("crawlers' indices:", indices)
fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL)
}
```

Output:

```
isCrawler: true
crawlers' indices: [237]
crawler's URL: https://discordapp.com
```
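
If you need details on every match rather than just the first, the indices returned by `MatchingCrawlers` can be mapped back through `Crawlers`. A minimal sketch, using only the variable and functions documented above:

```go
package main

import (
	"fmt"

	agents "github.com/monperrus/crawler-user-agents"
)

func main() {
	userAgent := "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)"

	// Each index returned by MatchingCrawlers points into agents.Crawlers.
	for _, i := range agents.MatchingCrawlers(userAgent) {
		c := agents.Crawlers[i]
		fmt.Printf("pattern=%q url=%s\n", c.Pattern, c.URL)
	}
}
```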

## Contributing

@@ -66,7 +100,6 @@ There are a few wrapper libraries that use this data to detect bots:
* [Voight-Kampff](https://github.com/biola/Voight-Kampff) (Ruby)
* [isbot](https://github.com/Hentioe/isbot) (Ruby)
* [crawlers](https://github.com/Olical/crawlers) (Clojure)
* [crawlerflagger](https://godoc.org/go.kelfa.io/kelfa/pkg/crawlerflagger) (Go)
* [isBot](https://github.com/omrilotan/isbot) (Node.JS)

Other systems for spotting robots, crawlers, and spiders that you may want to consider are:
3 changes: 3 additions & 0 deletions go.mod
@@ -0,0 +1,3 @@
module github.com/monperrus/crawler-user-agents

go 1.19
Empty file added go.sum
110 changes: 110 additions & 0 deletions validate.go
@@ -0,0 +1,110 @@
package agents

import (
_ "embed"
"encoding/json"
"fmt"
"regexp"
"time"
)

//go:embed crawler-user-agents.json
var crawlersJson []byte

// Crawler contains information about one crawler.
type Crawler struct {
// Regexp of User Agent of the crawler.
Pattern string `json:"pattern"`

// Discovery date.
AdditionDate time.Time `json:"addition_date"`

// Official url of the robot.
URL string `json:"url"`

// Examples of full User Agent strings.
Instances []string `json:"instances"`
}

// Private type needed to convert addition_date from/to the format used in JSON.
type jsonCrawler struct {
Pattern string `json:"pattern"`
AdditionDate string `json:"addition_date"`
URL string `json:"url"`
Instances []string `json:"instances"`
}

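// timeLayout is the date format used by addition_date in crawler-user-agents.json
// (Go reference date, i.e. YYYY/MM/DD).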
const timeLayout = "2006/01/02"

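// MarshalJSON serializes the crawler, formatting AdditionDate in the
// YYYY/MM/DD form used by the JSON file.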
func (c Crawler) MarshalJSON() ([]byte, error) {
jc := jsonCrawler{
Pattern: c.Pattern,
AdditionDate: c.AdditionDate.Format(timeLayout),
URL: c.URL,
Instances: c.Instances,
}
return json.Marshal(jc)
}

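// UnmarshalJSON parses a crawler record, rejecting records with an empty
// pattern and parsing addition_date as a UTC date.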
func (c *Crawler) UnmarshalJSON(b []byte) error {
var jc jsonCrawler
if err := json.Unmarshal(b, &jc); err != nil {
return err
}

c.Pattern = jc.Pattern
c.URL = jc.URL
c.Instances = jc.Instances

if c.Pattern == "" {
return fmt.Errorf("empty pattern in record %s", string(b))
}

if jc.AdditionDate != "" {
tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC)
if err != nil {
return err
}
c.AdditionDate = tim
}

return nil
}

// Crawlers is the list of crawlers, built from the contents of crawler-user-agents.json.
var Crawlers = func() []Crawler {
var crawlers []Crawler
if err := json.Unmarshal(crawlersJson, &crawlers); err != nil {
panic(err)
}
return crawlers
}()

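// regexps holds one compiled regexp per entry in Crawlers, built at package
// initialization; MustCompile panics if any pattern is invalid.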
var regexps = func() []*regexp.Regexp {
regexps := make([]*regexp.Regexp, len(Crawlers))
for i, crawler := range Crawlers {
regexps[i] = regexp.MustCompile(crawler.Pattern)
}
return regexps
}()

// IsCrawler reports whether the User Agent string matches any of the crawler patterns.
func IsCrawler(userAgent string) bool {
for _, re := range regexps {
if re.MatchString(userAgent) {
return true
}
}
return false
}

// MatchingCrawlers finds all crawlers matching the User Agent and returns their indices in Crawlers.
func MatchingCrawlers(userAgent string) []int {
indices := []int{}
for i, re := range regexps {
if re.MatchString(userAgent) {
indices = append(indices, i)
}
}
return indices
}
121 changes: 121 additions & 0 deletions validate_test.go
@@ -0,0 +1,121 @@
package agents

import (
"encoding/json"
"fmt"
"net/http"
"testing"
)

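// contains reports whether list includes value.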
func contains(list []int, value int) bool {
for _, elem := range list {
if elem == value {
return true
}
}
return false
}

func TestPatterns(t *testing.T) {
	// Load all crawlers embedded via go:embed;
	// some validation happens in UnmarshalJSON.
allCrawlers := Crawlers

// There are at least 10 crawlers.
if len(allCrawlers) < 10 {
t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers))
}

if IsCrawler(browserUA) {
t.Errorf("Browser UA %q was detected as a crawler.", browserUA)
}
if len(MatchingCrawlers(browserUA)) != 0 {
t.Errorf("MatchingCrawlers found crawlers matching Browser UA %q.", browserUA)
}

for i, crawler := range allCrawlers {
t.Run(crawler.Pattern, func(t *testing.T) {
fmt.Println(crawler.Pattern)

for _, instance := range crawler.Instances {
if !IsCrawler(instance) {
t.Errorf("Instance %q is not detected as a crawler.", instance)
}
hits := MatchingCrawlers(instance)
if !contains(hits, i) {
t.Errorf("Crawler with index %d (pattern %q) is not in the list returned by MatchingCrawlers(%q): %v.", i, crawler.Pattern, instance, hits)
}
}
})
}
}

func TestFalseNegatives(t *testing.T) {
const browsersURL = "https://raw.githubusercontent.com/microlinkhq/top-user-agents/master/src/index.json"
resp, err := http.Get(browsersURL)
if err != nil {
t.Fatalf("Failed to fetch the list of browser User Agents from %s: %v.", browsersURL, err)
}

t.Cleanup(func() {
if err := resp.Body.Close(); err != nil {
t.Fatal(err)
}
})

var browsers []string
if err := json.NewDecoder(resp.Body).Decode(&browsers); err != nil {
t.Fatalf("Failed to parse the list of browser User Agents: %v.", err)
}

for _, userAgent := range browsers {
if IsCrawler(userAgent) {
t.Errorf("Browser User Agent %q is recognized as a crawler.", userAgent)
}
indices := MatchingCrawlers(userAgent)
if len(indices) != 0 {
t.Errorf("Browser User Agent %q matches with crawlers %v.", userAgent, indices)
}
}
}

const (
crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/"
browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36"
)

func BenchmarkIsCrawlerPositive(b *testing.B) {
b.SetBytes(int64(len(crawlerUA)))
for n := 0; n < b.N; n++ {
if !IsCrawler(crawlerUA) {
b.Fail()
}
}
}

func BenchmarkMatchingCrawlersPositive(b *testing.B) {
b.SetBytes(int64(len(crawlerUA)))
for n := 0; n < b.N; n++ {
if len(MatchingCrawlers(crawlerUA)) == 0 {
b.Fail()
}
}
}

func BenchmarkIsCrawlerNegative(b *testing.B) {
b.SetBytes(int64(len(browserUA)))
for n := 0; n < b.N; n++ {
if IsCrawler(browserUA) {
b.Fail()
}
}
}

func BenchmarkMatchingCrawlersNegative(b *testing.B) {
b.SetBytes(int64(len(browserUA)))
for n := 0; n < b.N; n++ {
if len(MatchingCrawlers(browserUA)) != 0 {
b.Fail()
}
}
}