diff --git a/.github/workflows/ci-validation.yml b/.github/workflows/ci-validation.yml
index 0888568..38c5b50 100644
--- a/.github/workflows/ci-validation.yml
+++ b/.github/workflows/ci-validation.yml
@@ -23,3 +23,4 @@ jobs:
       - run: py.test -vv
       - run: python3 validate.py
       - run: php validate.php
+      - run: go test
diff --git a/README.md b/README.md
index b7f050c..49e7834 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,40 @@ Each `pattern` is a regular expression. It should work out-of-the-box with your f
 * JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent'])) { ... }`
 * PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...`
 * Python: `if re.search(entry['pattern'], ua): ...`
+* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents);
+  it provides the global variable `Crawlers` (kept in sync with `crawler-user-agents.json`)
+  and the functions `IsCrawler` and `MatchingCrawlers`.
+
+Example Go program:
+
+```go
+package main
+
+import (
+	"fmt"
+
+	agents "github.com/monperrus/crawler-user-agents"
+)
+
+func main() {
+	userAgent := "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)"
+
+	isCrawler := agents.IsCrawler(userAgent)
+	fmt.Println("isCrawler:", isCrawler)
+
+	indices := agents.MatchingCrawlers(userAgent)
+	fmt.Println("crawlers' indices:", indices)
+	fmt.Println("crawler's URL:", agents.Crawlers[indices[0]].URL)
+}
+```
+
+Output:
+
+```
+isCrawler: true
+crawlers' indices: [237]
+crawler's URL: https://discordapp.com
+```
 
 ## Contributing
 
@@ -66,7 +100,6 @@ There are a few wrapper libraries that use this data to detect bots:
 * [Voight-Kampff](https://github.com/biola/Voight-Kampff) (Ruby)
 * [isbot](https://github.com/Hentioe/isbot) (Ruby)
 * [crawlers](https://github.com/Olical/crawlers) (Clojure)
-* [crawlerflagger](https://godoc.org/go.kelfa.io/kelfa/pkg/crawlerflagger) (Go)
 * [isBot](https://github.com/omrilotan/isbot) (Node.JS)
 
 Other systems for spotting robots, crawlers, and spiders that you may want to consider are:
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..54db180
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/monperrus/crawler-user-agents
+
+go 1.19
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..e69de29
diff --git a/validate.go b/validate.go
new file mode 100644
index 0000000..41ab9d1
--- /dev/null
+++ b/validate.go
@@ -0,0 +1,110 @@
+package agents
+
+import (
+	_ "embed"
+	"encoding/json"
+	"fmt"
+	"regexp"
+	"time"
+)
+
+//go:embed crawler-user-agents.json
+var crawlersJson []byte
+
+// Crawler contains information about one crawler.
+type Crawler struct {
+	// Regexp of User Agent of the crawler.
+	Pattern string `json:"pattern"`
+
+	// Discovery date.
+	AdditionDate time.Time `json:"addition_date"`
+
+	// Official URL of the robot.
+	URL string `json:"url"`
+
+	// Examples of full User Agent strings.
+	Instances []string `json:"instances"`
+}
+
+// Private type needed to convert addition_date from/to the format used in JSON.
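+//
+// For reference, a record in crawler-user-agents.json has roughly this
+// shape (an illustrative sketch, not a verbatim entry from the file):
+//
+//	{
+//	  "pattern": "Googlebot\\/",
+//	  "addition_date": "2017/08/06",
+//	  "url": "http://www.google.com/bot.html",
+//	  "instances": ["Googlebot/2.1 (+http://www.google.com/bot.html)"]
+//	}
+//
+// addition_date is stored as a "YYYY/MM/DD" string (see timeLayout below),
+// which is why this mirror type declares AdditionDate as a string.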
+type jsonCrawler struct {
+	Pattern      string   `json:"pattern"`
+	AdditionDate string   `json:"addition_date"`
+	URL          string   `json:"url"`
+	Instances    []string `json:"instances"`
+}
+
+const timeLayout = "2006/01/02"
+
+func (c Crawler) MarshalJSON() ([]byte, error) {
+	jc := jsonCrawler{
+		Pattern:      c.Pattern,
+		AdditionDate: c.AdditionDate.Format(timeLayout),
+		URL:          c.URL,
+		Instances:    c.Instances,
+	}
+	return json.Marshal(jc)
+}
+
+func (c *Crawler) UnmarshalJSON(b []byte) error {
+	var jc jsonCrawler
+	if err := json.Unmarshal(b, &jc); err != nil {
+		return err
+	}
+
+	c.Pattern = jc.Pattern
+	c.URL = jc.URL
+	c.Instances = jc.Instances
+
+	if c.Pattern == "" {
+		return fmt.Errorf("empty pattern in record %s", string(b))
+	}
+
+	if jc.AdditionDate != "" {
+		tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC)
+		if err != nil {
+			return err
+		}
+		c.AdditionDate = tim
+	}
+
+	return nil
+}
+
+// Crawlers is the list of crawlers, built from the contents of crawler-user-agents.json.
+var Crawlers = func() []Crawler {
+	var crawlers []Crawler
+	if err := json.Unmarshal(crawlersJson, &crawlers); err != nil {
+		panic(err)
+	}
+	return crawlers
+}()
+
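+// All patterns are compiled once, at package initialization, so that
+// IsCrawler and MatchingCrawlers pay no compilation cost per call.
+// regexp.MustCompile panics on an invalid pattern, so a malformed entry
+// in crawler-user-agents.json fails fast instead of being silently skipped.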
+var regexps = func() []*regexp.Regexp {
+	regexps := make([]*regexp.Regexp, len(Crawlers))
+	for i, crawler := range Crawlers {
+		regexps[i] = regexp.MustCompile(crawler.Pattern)
+	}
+	return regexps
+}()
+
+// IsCrawler reports whether the User Agent string matches any of the crawler patterns.
+func IsCrawler(userAgent string) bool {
+	for _, re := range regexps {
+		if re.MatchString(userAgent) {
+			return true
+		}
+	}
+	return false
+}
+
+// MatchingCrawlers finds all crawlers matching the User Agent string and returns the list of their indices in Crawlers.
+func MatchingCrawlers(userAgent string) []int {
+	indices := []int{}
+	for i, re := range regexps {
+		if re.MatchString(userAgent) {
+			indices = append(indices, i)
+		}
+	}
+	return indices
+}
diff --git a/validate_test.go b/validate_test.go
new file mode 100644
index 0000000..6812ad2
--- /dev/null
+++ b/validate_test.go
@@ -0,0 +1,121 @@
+package agents
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"testing"
+)
+
+func contains(list []int, value int) bool {
+	for _, elem := range list {
+		if elem == value {
+			return true
+		}
+	}
+	return false
+}
+
+func TestPatterns(t *testing.T) {
+	// Load all crawlers embedded via go:embed;
+	// some validation already happens in UnmarshalJSON.
+	allCrawlers := Crawlers
+
+	// There must be at least 10 crawlers.
+	if len(allCrawlers) < 10 {
+		t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers))
+	}
+
+	if IsCrawler(browserUA) {
+		t.Errorf("Browser UA %q was detected as a crawler.", browserUA)
+	}
+	if len(MatchingCrawlers(browserUA)) != 0 {
+		t.Errorf("MatchingCrawlers found crawlers matching browser UA %q.", browserUA)
+	}
+
+	for i, crawler := range allCrawlers {
+		t.Run(crawler.Pattern, func(t *testing.T) {
+			fmt.Println(crawler.Pattern)
+
+			for _, instance := range crawler.Instances {
+				if !IsCrawler(instance) {
+					t.Errorf("Instance %q is not detected as a crawler.", instance)
+				}
+				hits := MatchingCrawlers(instance)
+				if !contains(hits, i) {
+					t.Errorf("Crawler with index %d (pattern %q) is not in the list returned by MatchingCrawlers(%q): %v.", i, crawler.Pattern, instance, hits)
+				}
+			}
+		})
+	}
+}
+
+func TestFalseNegatives(t *testing.T) {
+	const browsersURL = "https://raw.githubusercontent.com/microlinkhq/top-user-agents/master/src/index.json"
+	resp, err := http.Get(browsersURL)
+	if err != nil {
+		t.Fatalf("Failed to fetch the list of browser User Agents from %s: %v.", browsersURL, err)
+	}
+
+	t.Cleanup(func() {
+		if err := resp.Body.Close(); err != nil {
+			t.Fatal(err)
+		}
+	})
+
+	var browsers []string
+	if err := json.NewDecoder(resp.Body).Decode(&browsers); err != nil {
+		t.Fatalf("Failed to parse the list of browser User Agents: %v.", err)
+	}
+
+	for _, userAgent := range browsers {
+		if IsCrawler(userAgent) {
+			t.Errorf("Browser User Agent %q is recognized as a crawler.", userAgent)
+		}
+		indices := MatchingCrawlers(userAgent)
+		if len(indices) != 0 {
+			t.Errorf("Browser User Agent %q matches with crawlers %v.", userAgent, indices)
+		}
+	}
+}
+
+const (
+	crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/"
+	browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36"
+)
+
+func BenchmarkIsCrawlerPositive(b *testing.B) {
+	b.SetBytes(int64(len(crawlerUA)))
+	for n := 0; n < b.N; n++ {
+		if !IsCrawler(crawlerUA) {
+			b.Fail()
+		}
+	}
+}
+
+func BenchmarkMatchingCrawlersPositive(b *testing.B) {
+	b.SetBytes(int64(len(crawlerUA)))
+	for n := 0; n < b.N; n++ {
+		if len(MatchingCrawlers(crawlerUA)) == 0 {
+			b.Fail()
+		}
+	}
+}
+
+func BenchmarkIsCrawlerNegative(b *testing.B) {
+	b.SetBytes(int64(len(browserUA)))
+	for n := 0; n < b.N; n++ {
+		if IsCrawler(browserUA) {
+			b.Fail()
+		}
+	}
+}
+
+func BenchmarkMatchingCrawlersNegative(b *testing.B) {
+	b.SetBytes(int64(len(browserUA)))
+	for n := 0; n < b.N; n++ {
+		if len(MatchingCrawlers(browserUA)) != 0 {
+			b.Fail()
+		}
+	}
+}
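+
+// The b.SetBytes calls above make `go test -bench=.` report throughput as
+// MB/s of User Agent bytes scanned, so the positive and negative cases stay
+// comparable even though crawlerUA and browserUA have different lengths.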