-
-
Notifications
You must be signed in to change notification settings - Fork 247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
golang: speedup using TRIE #353
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,9 +2,13 @@ package agents | |
|
||
import ( | ||
_ "embed" | ||
"encoding/hex" | ||
"encoding/json" | ||
"fmt" | ||
"hash/maphash" | ||
"regexp" | ||
"strconv" | ||
"strings" | ||
"time" | ||
) | ||
|
||
|
@@ -80,31 +84,205 @@ var Crawlers = func() []Crawler { | |
return crawlers | ||
}() | ||
|
||
// pattern2literals maps patterns whose regexps only enumerate a small
// set of case variants to the explicit list of literal strings they can
// match. A leading "^" or " " in a literal is an anchor: matching is
// performed against "^" + userAgent + "$", so e.g. "^curl" only matches
// at the very beginning of the user agent.
var pattern2literals = map[string][]string{
	`[wW]get`:               {`wget`, `Wget`},
	`Ahrefs(Bot|SiteAudit)`: {`AhrefsBot`, `AhrefsSiteAudit`},
	`S[eE][mM]rushBot`:      {`SemrushBot`, `SeMrushBot`, `SEmrushBot`, `SEMrushBot`},
	`Livelap[bB]ot`:         {`Livelapbot`, `LivelapBot`},
	`[pP]ingdom`:            {`pingdom`, `Pingdom`},
	`Bark[rR]owler`:         {`Barkrowler`, `BarkRowler`},
	`^Apache-HttpClient`:    {`^Apache-HttpClient`},
	`^LCC `:                 {`^LCC `},
	`(^| )sentry\/`:         {`^sentry/`, ` sentry/`},
	`^curl`:                 {`^curl`},
	`[Cc]urebot`:            {`Curebot`, `curebot`},
	`^PHP-Curl-Class`:       {`^PHP-Curl-Class`},
	`(^| )PTST\/`:           {`^PTST/`, ` PTST/`},
	`^BW\/`:                 {`^BW/`},
}
|
||
// pattern2mainLiteral maps patterns that cannot be reduced to pure
// literals to a single literal substring that must be present for the
// pattern to match. When that substring is found, the full regexp is
// still run to confirm the match (see analyzePattern).
var pattern2mainLiteral = map[string]string{
	`AdsBot-Google([^-]|$)`:             `AdsBot-Google`,
	`BlogTraffic\/\d\.\d+ Feed-Fetcher`: `BlogTraffic/`,
}
|
||
func analyzePattern(pattern string) (olds []string, re *regexp.Regexp) { | ||
literals, has := pattern2literals[pattern] | ||
if has { | ||
return literals, nil | ||
} | ||
|
||
re = regexp.MustCompile(pattern) | ||
prefix, complete := re.LiteralPrefix() | ||
if complete { | ||
return []string{prefix}, nil | ||
} | ||
|
||
mainLiteral, has := pattern2mainLiteral[pattern] | ||
if !has { | ||
panic("don't know what to do with pattern: " + pattern) | ||
} | ||
return []string{mainLiteral}, re | ||
} | ||
|
||
// regexpPattern is a regexp that must confirm a candidate match,
// together with the index of its crawler in Crawlers.
type regexpPattern struct {
	re    *regexp.Regexp
	index int
}

// matcher performs a single multi-literal scan of the user agent via
// replacer and keeps the regexps needed to confirm ambiguous hits.
type matcher struct {
	replacer *strings.Replacer
	regexps  []regexpPattern
}
|
||
// uniqueToken is the marker the replacer inserts in place of matched
// literals. It is derived from maphash (randomly seeded per process)
// rather than a fixed string: per the review discussion, a short or
// deterministic token could be embedded in a hostile user agent to
// trigger the "corrupt replaced" panics in the matching functions.
var uniqueToken = hex.EncodeToString((&maphash.Hash{}).Sum(nil))
||
|
||
const (
	uniqueTokenLen = 2 * 8 // hex encoding of the 8-byte maphash sum
	numLen         = 5     // zero-padded decimal index following the token
	literalLabel   = '-'   // marker label: literal hit is definitive
	regexpLabel    = '*'   // marker label: run the regexp to confirm
)
|
||
var m = func() matcher { | ||
if len(uniqueToken) != uniqueTokenLen { | ||
panic("len(uniqueToken) != uniqueTokenLen") | ||
} | ||
|
||
regexps := []regexpPattern{} | ||
oldnew := make([]string, 0, len(Crawlers)*2) | ||
|
||
// Put re-based patterns to the end to prevent AdsBot-Google from | ||
// shadowing AdsBot-Google-Mobile. | ||
var oldnew2 []string | ||
|
||
for i, crawler := range Crawlers { | ||
regexps[i] = regexp.MustCompile(crawler.Pattern) | ||
literals, re := analyzePattern(crawler.Pattern) | ||
|
||
label := literalLabel | ||
num := i | ||
if re != nil { | ||
label = regexpLabel | ||
num = len(regexps) | ||
regexps = append(regexps, regexpPattern{ | ||
re: re, | ||
index: i, | ||
}) | ||
} | ||
|
||
replaceWith := fmt.Sprintf(" %s%c%0*d ", uniqueToken, label, numLen, num) | ||
|
||
for _, literal := range literals { | ||
if re != nil { | ||
oldnew2 = append(oldnew2, literal, replaceWith) | ||
} else { | ||
oldnew = append(oldnew, literal, replaceWith) | ||
} | ||
} | ||
} | ||
oldnew = append(oldnew, oldnew2...) | ||
|
||
// Allocate another array with regexps of exact size to save memory. | ||
regexps2 := make([]regexpPattern, len(regexps)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Save memory. I added a comment in the code. |
||
copy(regexps2, regexps) | ||
|
||
r := strings.NewReplacer(oldnew...) | ||
r.Replace("") // To cause internal build process. | ||
|
||
return matcher{ | ||
replacer: r, | ||
regexps: regexps2, | ||
} | ||
return regexps | ||
}() | ||
|
||
// Returns if User Agent string matches any of crawler patterns. | ||
func IsCrawler(userAgent string) bool { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of crashing, we can return error to allow the client to handle it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought about it. It would break an API, which is not great. Also I think in this case a panic is in place, because a panic means a mistake in the code [or in the config of an app / system, but not in this case]. Returning an error is for cases of mistake in the input in general (in this case, a mistake in user agent format). The panics that the code has will fire if there is a bug, a mistake in the code. |
||
for _, re := range regexps { | ||
if re.MatchString(userAgent) { | ||
// This code is mostly copy-paste of MatchingCrawlers, | ||
// but with early exit logic, so it works a but faster. | ||
|
||
text := "^" + userAgent + "$" | ||
replaced := m.replacer.Replace(text) | ||
if replaced == text { | ||
return false | ||
} | ||
|
||
for { | ||
uniquePos := strings.Index(replaced, uniqueToken) | ||
if uniquePos == -1 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can move this check to the for declaration, it's a bit cleaner There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you elaborate on this, please? for uniquePos := strings.Index(replaced, uniqueToken); uniquePos != -1; uniquePos = strings.Index(replaced, uniqueToken) {
} Like this? I don't really like it, because it copy-pastes the |
||
break | ||
} | ||
|
||
start := uniquePos + uniqueTokenLen + 1 | ||
if start+numLen >= len(replaced) { | ||
panic("corrupt replaced: " + replaced) | ||
} | ||
|
||
label := replaced[start-1] | ||
switch label { | ||
case literalLabel: | ||
return true | ||
case regexpLabel: | ||
// Rare case. Run regexp to confirm the match. | ||
indexStr := replaced[start : start+numLen] | ||
index, err := strconv.Atoi(indexStr) | ||
if err != nil { | ||
panic("corrupt replaced: " + replaced) | ||
} | ||
rp := m.regexps[index] | ||
if rp.re.MatchString(userAgent) { | ||
return true | ||
} | ||
default: | ||
panic("corrupt replaced: " + replaced) | ||
} | ||
|
||
replaced = replaced[start+numLen:] | ||
} | ||
|
||
return false | ||
} | ||
|
||
// Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers. | ||
func MatchingCrawlers(userAgent string) []int { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of crashing, we can return error to allow the client to handle it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See my comment above. |
||
text := "^" + userAgent + "$" | ||
replaced := m.replacer.Replace(text) | ||
if replaced == text { | ||
return []int{} | ||
} | ||
|
||
indices := []int{} | ||
for i, re := range regexps { | ||
if re.MatchString(userAgent) { | ||
indices = append(indices, i) | ||
for { | ||
uniquePos := strings.Index(replaced, uniqueToken) | ||
if uniquePos == -1 { | ||
break | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can move this check to the for declaration, it's a bit cleaner There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See my comment above. |
||
} | ||
|
||
start := uniquePos + uniqueTokenLen + 1 | ||
if start+numLen >= len(replaced) { | ||
panic("corrupt replaced: " + replaced) | ||
} | ||
indexStr := replaced[start : start+numLen] | ||
index, err := strconv.Atoi(indexStr) | ||
if err != nil { | ||
panic("corrupt replaced: " + replaced) | ||
} | ||
|
||
label := replaced[start-1] | ||
switch label { | ||
case literalLabel: | ||
indices = append(indices, index) | ||
case regexpLabel: | ||
// Rare case. Run regexp to confirm the match. | ||
rp := m.regexps[index] | ||
if rp.re.MatchString(userAgent) { | ||
indices = append(indices, rp.index) | ||
} | ||
default: | ||
panic("corrupt replaced: " + replaced) | ||
} | ||
|
||
replaced = replaced[start+numLen:] | ||
} | ||
|
||
return indices | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This works well, but requires maintenance as more patterns are added to the file